blob: c1550720735e4b301f0cb3b79d4155ce776f6761 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000056 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
/* File-local shorthand for PyUnicode_Resize() that accepts the address
   of any object-pointer variable and casts it to PyObject **.
   For use in unicodeobject.c only ! */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize((PyObject **)(unicodevar), (length))
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
371 for (i = size; i >= 0; i--)
372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
387 if (size > PyUnicode_GET_SIZE(unicode))
388 size = PyUnicode_GET_SIZE(unicode);
389#ifdef HAVE_USABLE_WCHAR_T
390 memcpy(w, unicode->str, size * sizeof(wchar_t));
391#else
392 {
393 register Py_UNICODE *u;
394 register int i;
395 u = PyUnicode_AS_UNICODE(unicode);
396 for (i = size; i >= 0; i--)
397 *w++ = *u++;
398 }
399#endif
400
401 return size;
402}
403
404#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000408 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
Hye-Shik Chang40574832004-04-06 07:24:51 +0000426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000428}
429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromObject(register PyObject *obj)
431{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000432 /* XXX Perhaps we should make this API an alias of
433 PyObject_Unicode() instead ?! */
434 if (PyUnicode_CheckExact(obj)) {
435 Py_INCREF(obj);
436 return obj;
437 }
438 if (PyUnicode_Check(obj)) {
439 /* For a Unicode subtype that's not a Unicode object,
440 return a true Unicode object with the same data. */
441 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
442 PyUnicode_GET_SIZE(obj));
443 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000444 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
445}
446
447PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
448 const char *encoding,
449 const char *errors)
450{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000451 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000453 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 if (obj == NULL) {
456 PyErr_BadInternalCall();
457 return NULL;
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460#if 0
461 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000462 that no encodings is given and then redirect to
463 PyObject_Unicode() which then applies the additional logic for
464 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466 NOTE: This API should really only be used for object which
467 represent *encoded* Unicode !
468
469 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000470 if (PyUnicode_Check(obj)) {
471 if (encoding) {
472 PyErr_SetString(PyExc_TypeError,
473 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000474 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000475 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000476 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000477 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000478#else
479 if (PyUnicode_Check(obj)) {
480 PyErr_SetString(PyExc_TypeError,
481 "decoding Unicode is not supported");
482 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#endif
485
486 /* Coerce object */
487 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000488 s = PyString_AS_STRING(obj);
489 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
492 /* Overwrite the error message with something more useful in
493 case of a TypeError. */
494 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000495 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000496 "coercing to Unicode: need string or buffer, "
497 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 obj->ob_type->tp_name);
499 goto onError;
500 }
Tim Petersced69f82003-09-16 20:30:58 +0000501
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000502 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 if (len == 0) {
504 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000509
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 return v;
511
512 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514}
515
516PyObject *PyUnicode_Decode(const char *s,
517 int size,
518 const char *encoding,
519 const char *errors)
520{
521 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000522
523 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000524 encoding = PyUnicode_GetDefaultEncoding();
525
526 /* Shortcuts for common default encodings */
527 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000529 else if (strcmp(encoding, "latin-1") == 0)
530 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
532 else if (strcmp(encoding, "mbcs") == 0)
533 return PyUnicode_DecodeMBCS(s, size, errors);
534#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "ascii") == 0)
536 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537
538 /* Decode via the codec registry */
539 buffer = PyBuffer_FromMemory((void *)s, size);
540 if (buffer == NULL)
541 goto onError;
542 unicode = PyCodec_Decode(buffer, encoding, errors);
543 if (unicode == NULL)
544 goto onError;
545 if (!PyUnicode_Check(unicode)) {
546 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000547 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548 unicode->ob_type->tp_name);
549 Py_DECREF(unicode);
550 goto onError;
551 }
552 Py_DECREF(buffer);
553 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000554
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555 onError:
556 Py_XDECREF(buffer);
557 return NULL;
558}
559
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000560PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
561 const char *encoding,
562 const char *errors)
563{
564 PyObject *v;
565
566 if (!PyUnicode_Check(unicode)) {
567 PyErr_BadArgument();
568 goto onError;
569 }
570
571 if (encoding == NULL)
572 encoding = PyUnicode_GetDefaultEncoding();
573
574 /* Decode via the codec registry */
575 v = PyCodec_Decode(unicode, encoding, errors);
576 if (v == NULL)
577 goto onError;
578 return v;
579
580 onError:
581 return NULL;
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584PyObject *PyUnicode_Encode(const Py_UNICODE *s,
585 int size,
586 const char *encoding,
587 const char *errors)
588{
589 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000590
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 unicode = PyUnicode_FromUnicode(s, size);
592 if (unicode == NULL)
593 return NULL;
594 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
595 Py_DECREF(unicode);
596 return v;
597}
598
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000599PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
600 const char *encoding,
601 const char *errors)
602{
603 PyObject *v;
604
605 if (!PyUnicode_Check(unicode)) {
606 PyErr_BadArgument();
607 goto onError;
608 }
609
610 if (encoding == NULL)
611 encoding = PyUnicode_GetDefaultEncoding();
612
613 /* Encode via the codec registry */
614 v = PyCodec_Encode(unicode, encoding, errors);
615 if (v == NULL)
616 goto onError;
617 return v;
618
619 onError:
620 return NULL;
621}
622
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
624 const char *encoding,
625 const char *errors)
626{
627 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 if (!PyUnicode_Check(unicode)) {
630 PyErr_BadArgument();
631 goto onError;
632 }
Fred Drakee4315f52000-05-09 19:53:39 +0000633
Tim Petersced69f82003-09-16 20:30:58 +0000634 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000635 encoding = PyUnicode_GetDefaultEncoding();
636
637 /* Shortcuts for common default encodings */
638 if (errors == NULL) {
639 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000640 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000641 else if (strcmp(encoding, "latin-1") == 0)
642 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000643#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
644 else if (strcmp(encoding, "mbcs") == 0)
645 return PyUnicode_AsMBCSString(unicode);
646#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000647 else if (strcmp(encoding, "ascii") == 0)
648 return PyUnicode_AsASCIIString(unicode);
649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650
651 /* Encode via the codec registry */
652 v = PyCodec_Encode(unicode, encoding, errors);
653 if (v == NULL)
654 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 if (!PyString_Check(v)) {
656 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000657 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 v->ob_type->tp_name);
659 Py_DECREF(v);
660 goto onError;
661 }
662 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 onError:
665 return NULL;
666}
667
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000668PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
669 const char *errors)
670{
671 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
672
673 if (v)
674 return v;
675 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
676 if (v && errors == NULL)
677 ((PyUnicodeObject *)unicode)->defenc = v;
678 return v;
679}
680
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
682{
683 if (!PyUnicode_Check(unicode)) {
684 PyErr_BadArgument();
685 goto onError;
686 }
687 return PyUnicode_AS_UNICODE(unicode);
688
689 onError:
690 return NULL;
691}
692
693int PyUnicode_GetSize(PyObject *unicode)
694{
695 if (!PyUnicode_Check(unicode)) {
696 PyErr_BadArgument();
697 goto onError;
698 }
699 return PyUnicode_GET_SIZE(unicode);
700
701 onError:
702 return -1;
703}
704
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000705int PyUnicode_GetWidth(PyObject *unicode)
706{
707 const Py_UNICODE *p, *e;
708 int width;
709
710 if (!PyUnicode_Check(unicode)) {
711 PyErr_BadArgument();
712 return -1;
713 }
714
715 p = PyUnicode_AS_UNICODE(unicode);
716 e = p + PyUnicode_GET_SIZE(unicode);
717 for (width = 0; p < e; p++)
718 if (Py_UNICODE_ISWIDE(*p))
719 width += 2;
720 else
721 width++;
722
723 return width;
724}
725
Thomas Wouters78890102000-07-22 19:25:51 +0000726const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000727{
728 return unicode_default_encoding;
729}
730
731int PyUnicode_SetDefaultEncoding(const char *encoding)
732{
733 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000734
Fred Drakee4315f52000-05-09 19:53:39 +0000735 /* Make sure the encoding is valid. As side effect, this also
736 loads the encoding into the codec registry cache. */
737 v = _PyCodec_Lookup(encoding);
738 if (v == NULL)
739 goto onError;
740 Py_DECREF(v);
741 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000742 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000743 sizeof(unicode_default_encoding));
744 return 0;
745
746 onError:
747 return -1;
748}
749
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000750/* error handling callback helper:
751 build arguments, call the callback and check the arguments,
752 if no exception occurred, copy the replacement to the output
753 and adjust various state variables.
754 return 0 on success, -1 on error
755*/
756
757static
758int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
759 const char *encoding, const char *reason,
760 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
761 PyObject **output, int *outpos, Py_UNICODE **outptr)
762{
763 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
764
765 PyObject *restuple = NULL;
766 PyObject *repunicode = NULL;
767 int outsize = PyUnicode_GET_SIZE(*output);
768 int requiredsize;
769 int newpos;
770 Py_UNICODE *repptr;
771 int repsize;
772 int res = -1;
773
774 if (*errorHandler == NULL) {
775 *errorHandler = PyCodec_LookupError(errors);
776 if (*errorHandler == NULL)
777 goto onError;
778 }
779
780 if (*exceptionObject == NULL) {
781 *exceptionObject = PyUnicodeDecodeError_Create(
782 encoding, input, insize, *startinpos, *endinpos, reason);
783 if (*exceptionObject == NULL)
784 goto onError;
785 }
786 else {
787 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
788 goto onError;
789 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
790 goto onError;
791 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
792 goto onError;
793 }
794
795 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
796 if (restuple == NULL)
797 goto onError;
798 if (!PyTuple_Check(restuple)) {
799 PyErr_Format(PyExc_TypeError, &argparse[4]);
800 goto onError;
801 }
802 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
803 goto onError;
804 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000805 newpos = insize+newpos;
806 if (newpos<0 || newpos>insize) {
807 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
808 goto onError;
809 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000810
811 /* need more space? (at least enough for what we
812 have+the replacement+the rest of the string (starting
813 at the new input position), so we won't have to check space
814 when there are no errors in the rest of the string) */
815 repptr = PyUnicode_AS_UNICODE(repunicode);
816 repsize = PyUnicode_GET_SIZE(repunicode);
817 requiredsize = *outpos + repsize + insize-newpos;
818 if (requiredsize > outsize) {
819 if (requiredsize<2*outsize)
820 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000821 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000822 goto onError;
823 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
824 }
825 *endinpos = newpos;
826 *inptr = input + newpos;
827 Py_UNICODE_COPY(*outptr, repptr, repsize);
828 *outptr += repsize;
829 *outpos += repsize;
830 /* we made it! */
831 res = 0;
832
833 onError:
834 Py_XDECREF(restuple);
835 return res;
836}
837
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000838/* --- UTF-7 Codec -------------------------------------------------------- */
839
840/* see RFC2152 for details */
841
/* Classification of the 128 ASCII code points for UTF-7, indexed by
   character code:
       0 - not special
       1 - special, i.e. cannot be directly encoded
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   The table is only ever read, hence const. */
static
const char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
860
/* True when character c has to be base64 encoded: always for characters
   above 127 and for class 1, and for the optional classes 2 (whitespace)
   and 3 (Set O) when the matching flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low 6 bits of n. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True when c is one of the 64 base64 digits.  Plain ASCII range tests are
   used instead of isalnum(): the result must not depend on the locale and
   c may be a value that is not representable as unsigned char. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
/* Value (0..63) of base64 digit c; only meaningful when B64CHAR(c). */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Write as many complete base64 digits as the accumulator ch holds;
   bits is the number of valid low bits in ch and is updated. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Move every complete 16-bit unit from the accumulator ch to out.
   Relies on an errmsg variable and a utf7Error label in the expanding
   function. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }

Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000896PyObject *PyUnicode_DecodeUTF7(const char *s,
897 int size,
898 const char *errors)
899{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000900 const char *starts = s;
901 int startinpos;
902 int endinpos;
903 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000904 const char *e;
905 PyUnicodeObject *unicode;
906 Py_UNICODE *p;
907 const char *errmsg = "";
908 int inShift = 0;
909 unsigned int bitsleft = 0;
910 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000911 int surrogate = 0;
912 PyObject *errorHandler = NULL;
913 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000914
915 unicode = _PyUnicode_New(size);
916 if (!unicode)
917 return NULL;
918 if (size == 0)
919 return (PyObject *)unicode;
920
921 p = unicode->str;
922 e = s + size;
923
924 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000925 Py_UNICODE ch;
926 restart:
927 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000928
929 if (inShift) {
930 if ((ch == '-') || !B64CHAR(ch)) {
931 inShift = 0;
932 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000933
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000934 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
935 if (bitsleft >= 6) {
936 /* The shift sequence has a partial character in it. If
937 bitsleft < 6 then we could just classify it as padding
938 but that is not the case here */
939
940 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000941 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942 }
943 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000944 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000945 here so indicate the potential of a misencoded character. */
946
947 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
948 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
949 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000950 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000951 }
952
953 if (ch == '-') {
954 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000955 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000956 inShift = 1;
957 }
958 } else if (SPECIAL(ch,0,0)) {
959 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000960 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000961 } else {
962 *p++ = ch;
963 }
964 } else {
965 charsleft = (charsleft << 6) | UB64(ch);
966 bitsleft += 6;
967 s++;
968 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
969 }
970 }
971 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000972 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000973 s++;
974 if (s < e && *s == '-') {
975 s++;
976 *p++ = '+';
977 } else
978 {
979 inShift = 1;
980 bitsleft = 0;
981 }
982 }
983 else if (SPECIAL(ch,0,0)) {
984 errmsg = "unexpected special character";
985 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000986 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000987 }
988 else {
989 *p++ = ch;
990 s++;
991 }
992 continue;
993 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000994 outpos = p-PyUnicode_AS_UNICODE(unicode);
995 endinpos = s-starts;
996 if (unicode_decode_call_errorhandler(
997 errors, &errorHandler,
998 "utf7", errmsg,
999 starts, size, &startinpos, &endinpos, &exc, &s,
1000 (PyObject **)&unicode, &outpos, &p))
1001 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 }
1003
1004 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001005 outpos = p-PyUnicode_AS_UNICODE(unicode);
1006 endinpos = size;
1007 if (unicode_decode_call_errorhandler(
1008 errors, &errorHandler,
1009 "utf7", "unterminated shift sequence",
1010 starts, size, &startinpos, &endinpos, &exc, &s,
1011 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001012 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001013 if (s < e)
1014 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001015 }
1016
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001017 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001018 goto onError;
1019
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001020 Py_XDECREF(errorHandler);
1021 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001022 return (PyObject *)unicode;
1023
1024onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001025 Py_XDECREF(errorHandler);
1026 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001027 Py_DECREF(unicode);
1028 return NULL;
1029}
1030
1031
1032PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1033 int size,
1034 int encodeSetO,
1035 int encodeWhiteSpace,
1036 const char *errors)
1037{
1038 PyObject *v;
1039 /* It might be possible to tighten this worst case */
1040 unsigned int cbAllocated = 5 * size;
1041 int inShift = 0;
1042 int i = 0;
1043 unsigned int bitsleft = 0;
1044 unsigned long charsleft = 0;
1045 char * out;
1046 char * start;
1047
1048 if (size == 0)
1049 return PyString_FromStringAndSize(NULL, 0);
1050
1051 v = PyString_FromStringAndSize(NULL, cbAllocated);
1052 if (v == NULL)
1053 return NULL;
1054
1055 start = out = PyString_AS_STRING(v);
1056 for (;i < size; ++i) {
1057 Py_UNICODE ch = s[i];
1058
1059 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001060 if (ch == '+') {
1061 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001062 *out++ = '-';
1063 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1064 charsleft = ch;
1065 bitsleft = 16;
1066 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001067 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001069 } else {
1070 *out++ = (char) ch;
1071 }
1072 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1074 *out++ = B64(charsleft << (6-bitsleft));
1075 charsleft = 0;
1076 bitsleft = 0;
1077 /* Characters not in the BASE64 set implicitly unshift the sequence
1078 so no '-' is required, except if the character is itself a '-' */
1079 if (B64CHAR(ch) || ch == '-') {
1080 *out++ = '-';
1081 }
1082 inShift = 0;
1083 *out++ = (char) ch;
1084 } else {
1085 bitsleft += 16;
1086 charsleft = (charsleft << 16) | ch;
1087 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1088
1089 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001090 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001091 or '-' then the shift sequence will be terminated implicitly and we
1092 don't have to insert a '-'. */
1093
1094 if (bitsleft == 0) {
1095 if (i + 1 < size) {
1096 Py_UNICODE ch2 = s[i+1];
1097
1098 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001099
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001100 } else if (B64CHAR(ch2) || ch2 == '-') {
1101 *out++ = '-';
1102 inShift = 0;
1103 } else {
1104 inShift = 0;
1105 }
1106
1107 }
1108 else {
1109 *out++ = '-';
1110 inShift = 0;
1111 }
1112 }
Tim Petersced69f82003-09-16 20:30:58 +00001113 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001116 if (bitsleft) {
1117 *out++= B64(charsleft << (6-bitsleft) );
1118 *out++ = '-';
1119 }
1120
Tim Peters5de98422002-04-27 18:44:32 +00001121 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001122 return v;
1123}
1124
1125#undef SPECIAL
1126#undef B64
1127#undef B64CHAR
1128#undef UB64
1129#undef ENCODE
1130#undef DECODE
1131
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132/* --- UTF-8 Codec -------------------------------------------------------- */
1133
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1155
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156PyObject *PyUnicode_DecodeUTF8(const char *s,
1157 int size,
1158 const char *errors)
1159{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001160 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001162 int startinpos;
1163 int endinpos;
1164 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 const char *e;
1166 PyUnicodeObject *unicode;
1167 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001168 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001169 PyObject *errorHandler = NULL;
1170 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171
1172 /* Note: size will always be longer than the resulting Unicode
1173 character count */
1174 unicode = _PyUnicode_New(size);
1175 if (!unicode)
1176 return NULL;
1177 if (size == 0)
1178 return (PyObject *)unicode;
1179
1180 /* Unpack UTF-8 encoded data */
1181 p = unicode->str;
1182 e = s + size;
1183
1184 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001185 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186
1187 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001188 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 s++;
1190 continue;
1191 }
1192
1193 n = utf8_code_length[ch];
1194
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001195 if (s + n > e) {
1196 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001197 startinpos = s-starts;
1198 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001199 goto utf8Error;
1200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
1202 switch (n) {
1203
1204 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001205 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001206 startinpos = s-starts;
1207 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001208 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209
1210 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001211 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001212 startinpos = s-starts;
1213 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001214 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215
1216 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001217 if ((s[1] & 0xc0) != 0x80) {
1218 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001219 startinpos = s-starts;
1220 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001221 goto utf8Error;
1222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001225 startinpos = s-starts;
1226 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 errmsg = "illegal encoding";
1228 goto utf8Error;
1229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001231 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 break;
1233
1234 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001235 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001236 (s[2] & 0xc0) != 0x80) {
1237 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001238 startinpos = s-starts;
1239 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001240 goto utf8Error;
1241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001243 if (ch < 0x0800) {
1244 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001245 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001246
1247 XXX For wide builds (UCS-4) we should probably try
1248 to recombine the surrogates into a single code
1249 unit.
1250 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001251 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001252 startinpos = s-starts;
1253 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001254 goto utf8Error;
1255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001257 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001258 break;
1259
1260 case 4:
1261 if ((s[1] & 0xc0) != 0x80 ||
1262 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001263 (s[3] & 0xc0) != 0x80) {
1264 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001265 startinpos = s-starts;
1266 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001267 goto utf8Error;
1268 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001269 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1270 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1271 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001272 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001273 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001274 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001275 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001276 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001277 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 startinpos = s-starts;
1279 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001280 goto utf8Error;
1281 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001282#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001283 *p++ = (Py_UNICODE)ch;
1284#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001285 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001286
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001287 /* translate from 10000..10FFFF to 0..FFFF */
1288 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001289
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001290 /* high surrogate = top 10 bits added to D800 */
1291 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001292
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001293 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001294 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001295#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 break;
1297
1298 default:
1299 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001300 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001301 startinpos = s-starts;
1302 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001303 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 }
1305 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001306 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001307
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001308 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001309 outpos = p-PyUnicode_AS_UNICODE(unicode);
1310 if (unicode_decode_call_errorhandler(
1311 errors, &errorHandler,
1312 "utf8", errmsg,
1313 starts, size, &startinpos, &endinpos, &exc, &s,
1314 (PyObject **)&unicode, &outpos, &p))
1315 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 }
1317
1318 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001319 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320 goto onError;
1321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001322 Py_XDECREF(errorHandler);
1323 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 return (PyObject *)unicode;
1325
1326onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001327 Py_XDECREF(errorHandler);
1328 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 Py_DECREF(unicode);
1330 return NULL;
1331}
1332
Tim Peters602f7402002-04-27 18:03:26 +00001333/* Allocation strategy: if the string is short, convert into a stack buffer
1334 and allocate exactly as much space needed at the end. Else allocate the
1335 maximum possible needed (4 result bytes per Unicode character), and return
1336 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001337*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001338PyObject *
1339PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1340 int size,
1341 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001342{
Tim Peters602f7402002-04-27 18:03:26 +00001343#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001344
Tim Peters602f7402002-04-27 18:03:26 +00001345 int i; /* index into s of next input byte */
1346 PyObject *v; /* result string object */
1347 char *p; /* next free byte in output buffer */
1348 int nallocated; /* number of result bytes allocated */
1349 int nneeded; /* number of result bytes needed */
1350 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001351
Tim Peters602f7402002-04-27 18:03:26 +00001352 assert(s != NULL);
1353 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354
Tim Peters602f7402002-04-27 18:03:26 +00001355 if (size <= MAX_SHORT_UNICHARS) {
1356 /* Write into the stack buffer; nallocated can't overflow.
1357 * At the end, we'll allocate exactly as much heap space as it
1358 * turns out we need.
1359 */
1360 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1361 v = NULL; /* will allocate after we're done */
1362 p = stackbuf;
1363 }
1364 else {
1365 /* Overallocate on the heap, and give the excess back at the end. */
1366 nallocated = size * 4;
1367 if (nallocated / 4 != size) /* overflow! */
1368 return PyErr_NoMemory();
1369 v = PyString_FromStringAndSize(NULL, nallocated);
1370 if (v == NULL)
1371 return NULL;
1372 p = PyString_AS_STRING(v);
1373 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001374
Tim Peters602f7402002-04-27 18:03:26 +00001375 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001376 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001377
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001378 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001379 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001381
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001383 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001384 *p++ = (char)(0xc0 | (ch >> 6));
1385 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001386 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001387 else {
Tim Peters602f7402002-04-27 18:03:26 +00001388 /* Encode UCS2 Unicode ordinals */
1389 if (ch < 0x10000) {
1390 /* Special case: check for high surrogate */
1391 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1392 Py_UCS4 ch2 = s[i];
1393 /* Check for low surrogate and combine the two to
1394 form a UCS4 value */
1395 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001396 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001397 i++;
1398 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001399 }
Tim Peters602f7402002-04-27 18:03:26 +00001400 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001401 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001402 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001403 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1404 *p++ = (char)(0x80 | (ch & 0x3f));
1405 continue;
1406 }
1407encodeUCS4:
1408 /* Encode UCS4 Unicode ordinals */
1409 *p++ = (char)(0xf0 | (ch >> 18));
1410 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1411 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1412 *p++ = (char)(0x80 | (ch & 0x3f));
1413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001415
Tim Peters602f7402002-04-27 18:03:26 +00001416 if (v == NULL) {
1417 /* This was stack allocated. */
1418 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1419 assert(nneeded <= nallocated);
1420 v = PyString_FromStringAndSize(stackbuf, nneeded);
1421 }
1422 else {
1423 /* Cut back to size actually needed. */
1424 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1425 assert(nneeded <= nallocated);
1426 _PyString_Resize(&v, nneeded);
1427 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001428 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001429
Tim Peters602f7402002-04-27 18:03:26 +00001430#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431}
1432
Guido van Rossumd57fd912000-03-10 22:53:23 +00001433PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1434{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 if (!PyUnicode_Check(unicode)) {
1436 PyErr_BadArgument();
1437 return NULL;
1438 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001439 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1440 PyUnicode_GET_SIZE(unicode),
1441 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001442}
1443
1444/* --- UTF-16 Codec ------------------------------------------------------- */
1445
Tim Peters772747b2001-08-09 22:21:55 +00001446PyObject *
1447PyUnicode_DecodeUTF16(const char *s,
1448 int size,
1449 const char *errors,
1450 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001451{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452 const char *starts = s;
1453 int startinpos;
1454 int endinpos;
1455 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 PyUnicodeObject *unicode;
1457 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001458 const unsigned char *q, *e;
1459 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001460 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001461 /* Offsets from q for retrieving byte pairs in the right order. */
1462#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1463 int ihi = 1, ilo = 0;
1464#else
1465 int ihi = 0, ilo = 1;
1466#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001467 PyObject *errorHandler = NULL;
1468 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469
1470 /* Note: size will always be longer than the resulting Unicode
1471 character count */
1472 unicode = _PyUnicode_New(size);
1473 if (!unicode)
1474 return NULL;
1475 if (size == 0)
1476 return (PyObject *)unicode;
1477
1478 /* Unpack UTF-16 encoded data */
1479 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001480 q = (unsigned char *)s;
1481 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482
1483 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001484 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001486 /* Check for BOM marks (U+FEFF) in the input and adjust current
1487 byte order setting accordingly. In native mode, the leading BOM
1488 mark is skipped, in all other modes, it is copied to the output
1489 stream as-is (giving a ZWNBSP character). */
1490 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001491 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001492#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001493 if (bom == 0xFEFF) {
1494 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001495 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001496 }
1497 else if (bom == 0xFFFE) {
1498 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001499 bo = 1;
1500 }
Tim Petersced69f82003-09-16 20:30:58 +00001501#else
Tim Peters772747b2001-08-09 22:21:55 +00001502 if (bom == 0xFEFF) {
1503 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001504 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001505 }
1506 else if (bom == 0xFFFE) {
1507 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001508 bo = -1;
1509 }
1510#endif
1511 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512
Tim Peters772747b2001-08-09 22:21:55 +00001513 if (bo == -1) {
1514 /* force LE */
1515 ihi = 1;
1516 ilo = 0;
1517 }
1518 else if (bo == 1) {
1519 /* force BE */
1520 ihi = 0;
1521 ilo = 1;
1522 }
1523
1524 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 Py_UNICODE ch;
1526 /* remaing bytes at the end? (size should be even) */
1527 if (e-q<2) {
1528 errmsg = "truncated data";
1529 startinpos = ((const char *)q)-starts;
1530 endinpos = ((const char *)e)-starts;
1531 goto utf16Error;
1532 /* The remaining input chars are ignored if the callback
1533 chooses to skip the input */
1534 }
1535 ch = (q[ihi] << 8) | q[ilo];
1536
Tim Peters772747b2001-08-09 22:21:55 +00001537 q += 2;
1538
Guido van Rossumd57fd912000-03-10 22:53:23 +00001539 if (ch < 0xD800 || ch > 0xDFFF) {
1540 *p++ = ch;
1541 continue;
1542 }
1543
1544 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001545 if (q >= e) {
1546 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 startinpos = (((const char *)q)-2)-starts;
1548 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001549 goto utf16Error;
1550 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001551 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001552 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1553 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001554 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001555#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001556 *p++ = ch;
1557 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001558#else
1559 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001560#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001561 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001562 }
1563 else {
1564 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001565 startinpos = (((const char *)q)-4)-starts;
1566 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001567 goto utf16Error;
1568 }
1569
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001571 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001572 startinpos = (((const char *)q)-2)-starts;
1573 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001574 /* Fall through to report the error */
1575
1576 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001577 outpos = p-PyUnicode_AS_UNICODE(unicode);
1578 if (unicode_decode_call_errorhandler(
1579 errors, &errorHandler,
1580 "utf16", errmsg,
1581 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1582 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001583 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584 }
1585
1586 if (byteorder)
1587 *byteorder = bo;
1588
1589 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001590 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591 goto onError;
1592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001593 Py_XDECREF(errorHandler);
1594 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001595 return (PyObject *)unicode;
1596
1597onError:
1598 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001599 Py_XDECREF(errorHandler);
1600 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601 return NULL;
1602}
1603
Tim Peters772747b2001-08-09 22:21:55 +00001604PyObject *
1605PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1606 int size,
1607 const char *errors,
1608 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609{
1610 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001611 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001612#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001613 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001614#else
1615 const int pairs = 0;
1616#endif
Tim Peters772747b2001-08-09 22:21:55 +00001617 /* Offsets from p for storing byte pairs in the right order. */
1618#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1619 int ihi = 1, ilo = 0;
1620#else
1621 int ihi = 0, ilo = 1;
1622#endif
1623
1624#define STORECHAR(CH) \
1625 do { \
1626 p[ihi] = ((CH) >> 8) & 0xff; \
1627 p[ilo] = (CH) & 0xff; \
1628 p += 2; \
1629 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001631#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001632 for (i = pairs = 0; i < size; i++)
1633 if (s[i] >= 0x10000)
1634 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001635#endif
Tim Petersced69f82003-09-16 20:30:58 +00001636 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001637 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 if (v == NULL)
1639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001640
Tim Peters772747b2001-08-09 22:21:55 +00001641 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001642 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001643 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001644 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001645 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001646
1647 if (byteorder == -1) {
1648 /* force LE */
1649 ihi = 1;
1650 ilo = 0;
1651 }
1652 else if (byteorder == 1) {
1653 /* force BE */
1654 ihi = 0;
1655 ilo = 1;
1656 }
1657
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001658 while (size-- > 0) {
1659 Py_UNICODE ch = *s++;
1660 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001661#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001662 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001663 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1664 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001666#endif
Tim Peters772747b2001-08-09 22:21:55 +00001667 STORECHAR(ch);
1668 if (ch2)
1669 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001672#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673}
1674
1675PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1676{
1677 if (!PyUnicode_Check(unicode)) {
1678 PyErr_BadArgument();
1679 return NULL;
1680 }
1681 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1682 PyUnicode_GET_SIZE(unicode),
1683 NULL,
1684 0);
1685}
1686
1687/* --- Unicode Escape Codec ----------------------------------------------- */
1688
Fredrik Lundh06d12682001-01-24 07:59:11 +00001689static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001690
Guido van Rossumd57fd912000-03-10 22:53:23 +00001691PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1692 int size,
1693 const char *errors)
1694{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001695 const char *starts = s;
1696 int startinpos;
1697 int endinpos;
1698 int outpos;
1699 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001701 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001703 char* message;
1704 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001705 PyObject *errorHandler = NULL;
1706 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001707
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708 /* Escaped strings will always be longer than the resulting
1709 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001710 length after conversion to the true value.
1711 (but if the error callback returns a long replacement string
1712 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 v = _PyUnicode_New(size);
1714 if (v == NULL)
1715 goto onError;
1716 if (size == 0)
1717 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001719 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001721
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 while (s < end) {
1723 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001724 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001725 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726
1727 /* Non-escape characters are interpreted as Unicode ordinals */
1728 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001729 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 continue;
1731 }
1732
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001733 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 /* \ - Escapes */
1735 s++;
1736 switch (*s++) {
1737
1738 /* \x escapes */
1739 case '\n': break;
1740 case '\\': *p++ = '\\'; break;
1741 case '\'': *p++ = '\''; break;
1742 case '\"': *p++ = '\"'; break;
1743 case 'b': *p++ = '\b'; break;
1744 case 'f': *p++ = '\014'; break; /* FF */
1745 case 't': *p++ = '\t'; break;
1746 case 'n': *p++ = '\n'; break;
1747 case 'r': *p++ = '\r'; break;
1748 case 'v': *p++ = '\013'; break; /* VT */
1749 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1750
1751 /* \OOO (octal) escapes */
1752 case '0': case '1': case '2': case '3':
1753 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001754 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001756 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001758 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001760 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761 break;
1762
Fredrik Lundhccc74732001-02-18 22:13:49 +00001763 /* hex escapes */
1764 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001766 digits = 2;
1767 message = "truncated \\xXX escape";
1768 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769
Fredrik Lundhccc74732001-02-18 22:13:49 +00001770 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001772 digits = 4;
1773 message = "truncated \\uXXXX escape";
1774 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775
Fredrik Lundhccc74732001-02-18 22:13:49 +00001776 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001777 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001778 digits = 8;
1779 message = "truncated \\UXXXXXXXX escape";
1780 hexescape:
1781 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001782 outpos = p-PyUnicode_AS_UNICODE(v);
1783 if (s+digits>end) {
1784 endinpos = size;
1785 if (unicode_decode_call_errorhandler(
1786 errors, &errorHandler,
1787 "unicodeescape", "end of string in escape sequence",
1788 starts, size, &startinpos, &endinpos, &exc, &s,
1789 (PyObject **)&v, &outpos, &p))
1790 goto onError;
1791 goto nextByte;
1792 }
1793 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001794 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001795 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001796 endinpos = (s+i+1)-starts;
1797 if (unicode_decode_call_errorhandler(
1798 errors, &errorHandler,
1799 "unicodeescape", message,
1800 starts, size, &startinpos, &endinpos, &exc, &s,
1801 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001802 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001804 }
1805 chr = (chr<<4) & ~0xF;
1806 if (c >= '0' && c <= '9')
1807 chr += c - '0';
1808 else if (c >= 'a' && c <= 'f')
1809 chr += 10 + c - 'a';
1810 else
1811 chr += 10 + c - 'A';
1812 }
1813 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001814 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 /* _decoding_error will have already written into the
1816 target buffer. */
1817 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001818 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001819 /* when we get here, chr is a 32-bit unicode character */
1820 if (chr <= 0xffff)
1821 /* UCS-2 character */
1822 *p++ = (Py_UNICODE) chr;
1823 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001824 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001825 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001826#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001827 *p++ = chr;
1828#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001829 chr -= 0x10000L;
1830 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001831 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001832#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001833 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001834 endinpos = s-starts;
1835 outpos = p-PyUnicode_AS_UNICODE(v);
1836 if (unicode_decode_call_errorhandler(
1837 errors, &errorHandler,
1838 "unicodeescape", "illegal Unicode character",
1839 starts, size, &startinpos, &endinpos, &exc, &s,
1840 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001841 goto onError;
1842 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001843 break;
1844
1845 /* \N{name} */
1846 case 'N':
1847 message = "malformed \\N character escape";
1848 if (ucnhash_CAPI == NULL) {
1849 /* load the unicode data module */
1850 PyObject *m, *v;
1851 m = PyImport_ImportModule("unicodedata");
1852 if (m == NULL)
1853 goto ucnhashError;
1854 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1855 Py_DECREF(m);
1856 if (v == NULL)
1857 goto ucnhashError;
1858 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1859 Py_DECREF(v);
1860 if (ucnhash_CAPI == NULL)
1861 goto ucnhashError;
1862 }
1863 if (*s == '{') {
1864 const char *start = s+1;
1865 /* look for the closing brace */
1866 while (*s != '}' && s < end)
1867 s++;
1868 if (s > start && s < end && *s == '}') {
1869 /* found a name. look it up in the unicode database */
1870 message = "unknown Unicode character name";
1871 s++;
1872 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1873 goto store;
1874 }
1875 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 endinpos = s-starts;
1877 outpos = p-PyUnicode_AS_UNICODE(v);
1878 if (unicode_decode_call_errorhandler(
1879 errors, &errorHandler,
1880 "unicodeescape", message,
1881 starts, size, &startinpos, &endinpos, &exc, &s,
1882 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001883 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001884 break;
1885
1886 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001887 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 message = "\\ at end of string";
1889 s--;
1890 endinpos = s-starts;
1891 outpos = p-PyUnicode_AS_UNICODE(v);
1892 if (unicode_decode_call_errorhandler(
1893 errors, &errorHandler,
1894 "unicodeescape", message,
1895 starts, size, &startinpos, &endinpos, &exc, &s,
1896 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001897 goto onError;
1898 }
1899 else {
1900 *p++ = '\\';
1901 *p++ = (unsigned char)s[-1];
1902 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001903 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001905 nextByte:
1906 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001908 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001909 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001910 Py_XDECREF(errorHandler);
1911 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001913
Fredrik Lundhccc74732001-02-18 22:13:49 +00001914ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001915 PyErr_SetString(
1916 PyExc_UnicodeError,
1917 "\\N escapes not supported (can't load unicodedata module)"
1918 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001919 Py_XDECREF(errorHandler);
1920 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001921 return NULL;
1922
Fredrik Lundhccc74732001-02-18 22:13:49 +00001923onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001925 Py_XDECREF(errorHandler);
1926 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927 return NULL;
1928}
1929
1930/* Return a Unicode-Escape string version of the Unicode object.
1931
1932 If quotes is true, the string is enclosed in u"" or u'' quotes as
1933 appropriate.
1934
1935*/
1936
Barry Warsaw51ac5802000-03-20 16:36:48 +00001937static const Py_UNICODE *findchar(const Py_UNICODE *s,
1938 int size,
1939 Py_UNICODE ch);
1940
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941static
1942PyObject *unicodeescape_string(const Py_UNICODE *s,
1943 int size,
1944 int quotes)
1945{
1946 PyObject *repr;
1947 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001949 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001950
1951 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1952 if (repr == NULL)
1953 return NULL;
1954
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001955 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956
1957 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001958 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001959 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960 !findchar(s, size, '"')) ? '"' : '\'';
1961 }
1962 while (size-- > 0) {
1963 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001964
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001966 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001967 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968 *p++ = '\\';
1969 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001970 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001971 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001972
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001973#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001974 /* Map 21-bit characters to '\U00xxxxxx' */
1975 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001976 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001977
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001978 /* Resize the string if necessary */
1979 if (offset + 12 > PyString_GET_SIZE(repr)) {
1980 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001981 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001982 p = PyString_AS_STRING(repr) + offset;
1983 }
1984
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001985 *p++ = '\\';
1986 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001987 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1988 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1989 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1990 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1991 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1992 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1993 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001994 *p++ = hexdigit[ch & 0x0000000F];
1995 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001996 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001997#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001998 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1999 else if (ch >= 0xD800 && ch < 0xDC00) {
2000 Py_UNICODE ch2;
2001 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002002
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002003 ch2 = *s++;
2004 size--;
2005 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2006 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2007 *p++ = '\\';
2008 *p++ = 'U';
2009 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2010 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2011 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2012 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2013 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2014 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2015 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2016 *p++ = hexdigit[ucs & 0x0000000F];
2017 continue;
2018 }
2019 /* Fall through: isolated surrogates are copied as-is */
2020 s--;
2021 size++;
2022 }
2023
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002025 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026 *p++ = '\\';
2027 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002028 *p++ = hexdigit[(ch >> 12) & 0x000F];
2029 *p++ = hexdigit[(ch >> 8) & 0x000F];
2030 *p++ = hexdigit[(ch >> 4) & 0x000F];
2031 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002033
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002034 /* Map special whitespace to '\t', \n', '\r' */
2035 else if (ch == '\t') {
2036 *p++ = '\\';
2037 *p++ = 't';
2038 }
2039 else if (ch == '\n') {
2040 *p++ = '\\';
2041 *p++ = 'n';
2042 }
2043 else if (ch == '\r') {
2044 *p++ = '\\';
2045 *p++ = 'r';
2046 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002047
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002048 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002049 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002051 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002052 *p++ = hexdigit[(ch >> 4) & 0x000F];
2053 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002054 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002055
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 /* Copy everything else as-is */
2057 else
2058 *p++ = (char) ch;
2059 }
2060 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002061 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062
2063 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002064 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 return repr;
2066}
2067
2068PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2069 int size)
2070{
2071 return unicodeescape_string(s, size, 0);
2072}
2073
2074PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2075{
2076 if (!PyUnicode_Check(unicode)) {
2077 PyErr_BadArgument();
2078 return NULL;
2079 }
2080 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2081 PyUnicode_GET_SIZE(unicode));
2082}
2083
2084/* --- Raw Unicode Escape Codec ------------------------------------------- */
2085
2086PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2087 int size,
2088 const char *errors)
2089{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002090 const char *starts = s;
2091 int startinpos;
2092 int endinpos;
2093 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002095 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096 const char *end;
2097 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002098 PyObject *errorHandler = NULL;
2099 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002100
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101 /* Escaped strings will always be longer than the resulting
2102 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002103 length after conversion to the true value. (But decoding error
2104 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105 v = _PyUnicode_New(size);
2106 if (v == NULL)
2107 goto onError;
2108 if (size == 0)
2109 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002110 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 end = s + size;
2112 while (s < end) {
2113 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002114 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002116 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117
2118 /* Non-escape characters are interpreted as Unicode ordinals */
2119 if (*s != '\\') {
2120 *p++ = (unsigned char)*s++;
2121 continue;
2122 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002123 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124
2125 /* \u-escapes are only interpreted iff the number of leading
2126 backslashes if odd */
2127 bs = s;
2128 for (;s < end;) {
2129 if (*s != '\\')
2130 break;
2131 *p++ = (unsigned char)*s++;
2132 }
2133 if (((s - bs) & 1) == 0 ||
2134 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002135 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 continue;
2137 }
2138 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002139 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 s++;
2141
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002142 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002143 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002144 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002145 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002147 endinpos = s-starts;
2148 if (unicode_decode_call_errorhandler(
2149 errors, &errorHandler,
2150 "rawunicodeescape", "truncated \\uXXXX",
2151 starts, size, &startinpos, &endinpos, &exc, &s,
2152 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002154 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155 }
2156 x = (x<<4) & ~0xF;
2157 if (c >= '0' && c <= '9')
2158 x += c - '0';
2159 else if (c >= 'a' && c <= 'f')
2160 x += 10 + c - 'a';
2161 else
2162 x += 10 + c - 'A';
2163 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002164#ifndef Py_UNICODE_WIDE
2165 if (x > 0x10000) {
2166 if (unicode_decode_call_errorhandler(
2167 errors, &errorHandler,
2168 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2169 starts, size, &startinpos, &endinpos, &exc, &s,
2170 (PyObject **)&v, &outpos, &p))
2171 goto onError;
2172 }
2173#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002174 *p++ = x;
2175 nextByte:
2176 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002178 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002179 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002180 Py_XDECREF(errorHandler);
2181 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002183
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184 onError:
2185 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002186 Py_XDECREF(errorHandler);
2187 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 return NULL;
2189}
2190
2191PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2192 int size)
2193{
2194 PyObject *repr;
2195 char *p;
2196 char *q;
2197
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002198 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002200#ifdef Py_UNICODE_WIDE
2201 repr = PyString_FromStringAndSize(NULL, 10 * size);
2202#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002204#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 if (repr == NULL)
2206 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002207 if (size == 0)
2208 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209
2210 p = q = PyString_AS_STRING(repr);
2211 while (size-- > 0) {
2212 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002213#ifdef Py_UNICODE_WIDE
2214 /* Map 32-bit characters to '\Uxxxxxxxx' */
2215 if (ch >= 0x10000) {
2216 *p++ = '\\';
2217 *p++ = 'U';
2218 *p++ = hexdigit[(ch >> 28) & 0xf];
2219 *p++ = hexdigit[(ch >> 24) & 0xf];
2220 *p++ = hexdigit[(ch >> 20) & 0xf];
2221 *p++ = hexdigit[(ch >> 16) & 0xf];
2222 *p++ = hexdigit[(ch >> 12) & 0xf];
2223 *p++ = hexdigit[(ch >> 8) & 0xf];
2224 *p++ = hexdigit[(ch >> 4) & 0xf];
2225 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002226 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002227 else
2228#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229 /* Map 16-bit characters to '\uxxxx' */
2230 if (ch >= 256) {
2231 *p++ = '\\';
2232 *p++ = 'u';
2233 *p++ = hexdigit[(ch >> 12) & 0xf];
2234 *p++ = hexdigit[(ch >> 8) & 0xf];
2235 *p++ = hexdigit[(ch >> 4) & 0xf];
2236 *p++ = hexdigit[ch & 15];
2237 }
2238 /* Copy everything else as-is */
2239 else
2240 *p++ = (char) ch;
2241 }
2242 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002243 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 return repr;
2245}
2246
2247PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2248{
2249 if (!PyUnicode_Check(unicode)) {
2250 PyErr_BadArgument();
2251 return NULL;
2252 }
2253 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2254 PyUnicode_GET_SIZE(unicode));
2255}
2256
2257/* --- Latin-1 Codec ------------------------------------------------------ */
2258
2259PyObject *PyUnicode_DecodeLatin1(const char *s,
2260 int size,
2261 const char *errors)
2262{
2263 PyUnicodeObject *v;
2264 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002265
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002267 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002268 Py_UNICODE r = *(unsigned char*)s;
2269 return PyUnicode_FromUnicode(&r, 1);
2270 }
2271
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 v = _PyUnicode_New(size);
2273 if (v == NULL)
2274 goto onError;
2275 if (size == 0)
2276 return (PyObject *)v;
2277 p = PyUnicode_AS_UNICODE(v);
2278 while (size-- > 0)
2279 *p++ = (unsigned char)*s++;
2280 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002281
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282 onError:
2283 Py_XDECREF(v);
2284 return NULL;
2285}
2286
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002287/* create or adjust a UnicodeEncodeError */
2288static void make_encode_exception(PyObject **exceptionObject,
2289 const char *encoding,
2290 const Py_UNICODE *unicode, int size,
2291 int startpos, int endpos,
2292 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002294 if (*exceptionObject == NULL) {
2295 *exceptionObject = PyUnicodeEncodeError_Create(
2296 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 }
2298 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002299 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2300 goto onError;
2301 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2302 goto onError;
2303 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2304 goto onError;
2305 return;
2306 onError:
2307 Py_DECREF(*exceptionObject);
2308 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309 }
2310}
2311
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312/* raises a UnicodeEncodeError */
2313static void raise_encode_exception(PyObject **exceptionObject,
2314 const char *encoding,
2315 const Py_UNICODE *unicode, int size,
2316 int startpos, int endpos,
2317 const char *reason)
2318{
2319 make_encode_exception(exceptionObject,
2320 encoding, unicode, size, startpos, endpos, reason);
2321 if (*exceptionObject != NULL)
2322 PyCodec_StrictErrors(*exceptionObject);
2323}
2324
2325/* error handling callback helper:
2326 build arguments, call the callback and check the arguments,
2327 put the result into newpos and return the replacement string, which
2328 has to be freed by the caller */
2329static PyObject *unicode_encode_call_errorhandler(const char *errors,
2330 PyObject **errorHandler,
2331 const char *encoding, const char *reason,
2332 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2333 int startpos, int endpos,
2334 int *newpos)
2335{
2336 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2337
2338 PyObject *restuple;
2339 PyObject *resunicode;
2340
2341 if (*errorHandler == NULL) {
2342 *errorHandler = PyCodec_LookupError(errors);
2343 if (*errorHandler == NULL)
2344 return NULL;
2345 }
2346
2347 make_encode_exception(exceptionObject,
2348 encoding, unicode, size, startpos, endpos, reason);
2349 if (*exceptionObject == NULL)
2350 return NULL;
2351
2352 restuple = PyObject_CallFunctionObjArgs(
2353 *errorHandler, *exceptionObject, NULL);
2354 if (restuple == NULL)
2355 return NULL;
2356 if (!PyTuple_Check(restuple)) {
2357 PyErr_Format(PyExc_TypeError, &argparse[4]);
2358 Py_DECREF(restuple);
2359 return NULL;
2360 }
2361 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2362 &resunicode, newpos)) {
2363 Py_DECREF(restuple);
2364 return NULL;
2365 }
2366 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002367 *newpos = size+*newpos;
2368 if (*newpos<0 || *newpos>size) {
2369 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2370 Py_DECREF(restuple);
2371 return NULL;
2372 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002373 Py_INCREF(resunicode);
2374 Py_DECREF(restuple);
2375 return resunicode;
2376}
2377
2378static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2379 int size,
2380 const char *errors,
2381 int limit)
2382{
2383 /* output object */
2384 PyObject *res;
2385 /* pointers to the beginning and end+1 of input */
2386 const Py_UNICODE *startp = p;
2387 const Py_UNICODE *endp = p + size;
2388 /* pointer to the beginning of the unencodable characters */
2389 /* const Py_UNICODE *badp = NULL; */
2390 /* pointer into the output */
2391 char *str;
2392 /* current output position */
2393 int respos = 0;
2394 int ressize;
2395 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2396 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2397 PyObject *errorHandler = NULL;
2398 PyObject *exc = NULL;
2399 /* the following variable is used for caching string comparisons
2400 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2401 int known_errorHandler = -1;
2402
2403 /* allocate enough for a simple encoding without
2404 replacements, if we need more, we'll resize */
2405 res = PyString_FromStringAndSize(NULL, size);
2406 if (res == NULL)
2407 goto onError;
2408 if (size == 0)
2409 return res;
2410 str = PyString_AS_STRING(res);
2411 ressize = size;
2412
2413 while (p<endp) {
2414 Py_UNICODE c = *p;
2415
2416 /* can we encode this? */
2417 if (c<limit) {
2418 /* no overflow check, because we know that the space is enough */
2419 *str++ = (char)c;
2420 ++p;
2421 }
2422 else {
2423 int unicodepos = p-startp;
2424 int requiredsize;
2425 PyObject *repunicode;
2426 int repsize;
2427 int newpos;
2428 int respos;
2429 Py_UNICODE *uni2;
2430 /* startpos for collecting unencodable chars */
2431 const Py_UNICODE *collstart = p;
2432 const Py_UNICODE *collend = p;
2433 /* find all unecodable characters */
2434 while ((collend < endp) && ((*collend)>=limit))
2435 ++collend;
2436 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2437 if (known_errorHandler==-1) {
2438 if ((errors==NULL) || (!strcmp(errors, "strict")))
2439 known_errorHandler = 1;
2440 else if (!strcmp(errors, "replace"))
2441 known_errorHandler = 2;
2442 else if (!strcmp(errors, "ignore"))
2443 known_errorHandler = 3;
2444 else if (!strcmp(errors, "xmlcharrefreplace"))
2445 known_errorHandler = 4;
2446 else
2447 known_errorHandler = 0;
2448 }
2449 switch (known_errorHandler) {
2450 case 1: /* strict */
2451 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2452 goto onError;
2453 case 2: /* replace */
2454 while (collstart++<collend)
2455 *str++ = '?'; /* fall through */
2456 case 3: /* ignore */
2457 p = collend;
2458 break;
2459 case 4: /* xmlcharrefreplace */
2460 respos = str-PyString_AS_STRING(res);
2461 /* determine replacement size (temporarily (mis)uses p) */
2462 for (p = collstart, repsize = 0; p < collend; ++p) {
2463 if (*p<10)
2464 repsize += 2+1+1;
2465 else if (*p<100)
2466 repsize += 2+2+1;
2467 else if (*p<1000)
2468 repsize += 2+3+1;
2469 else if (*p<10000)
2470 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002471#ifndef Py_UNICODE_WIDE
2472 else
2473 repsize += 2+5+1;
2474#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475 else if (*p<100000)
2476 repsize += 2+5+1;
2477 else if (*p<1000000)
2478 repsize += 2+6+1;
2479 else
2480 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002481#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002482 }
2483 requiredsize = respos+repsize+(endp-collend);
2484 if (requiredsize > ressize) {
2485 if (requiredsize<2*ressize)
2486 requiredsize = 2*ressize;
2487 if (_PyString_Resize(&res, requiredsize))
2488 goto onError;
2489 str = PyString_AS_STRING(res) + respos;
2490 ressize = requiredsize;
2491 }
2492 /* generate replacement (temporarily (mis)uses p) */
2493 for (p = collstart; p < collend; ++p) {
2494 str += sprintf(str, "&#%d;", (int)*p);
2495 }
2496 p = collend;
2497 break;
2498 default:
2499 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2500 encoding, reason, startp, size, &exc,
2501 collstart-startp, collend-startp, &newpos);
2502 if (repunicode == NULL)
2503 goto onError;
2504 /* need more space? (at least enough for what we
2505 have+the replacement+the rest of the string, so
2506 we won't have to check space for encodable characters) */
2507 respos = str-PyString_AS_STRING(res);
2508 repsize = PyUnicode_GET_SIZE(repunicode);
2509 requiredsize = respos+repsize+(endp-collend);
2510 if (requiredsize > ressize) {
2511 if (requiredsize<2*ressize)
2512 requiredsize = 2*ressize;
2513 if (_PyString_Resize(&res, requiredsize)) {
2514 Py_DECREF(repunicode);
2515 goto onError;
2516 }
2517 str = PyString_AS_STRING(res) + respos;
2518 ressize = requiredsize;
2519 }
2520 /* check if there is anything unencodable in the replacement
2521 and copy it to the output */
2522 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2523 c = *uni2;
2524 if (c >= limit) {
2525 raise_encode_exception(&exc, encoding, startp, size,
2526 unicodepos, unicodepos+1, reason);
2527 Py_DECREF(repunicode);
2528 goto onError;
2529 }
2530 *str = (char)c;
2531 }
2532 p = startp + newpos;
2533 Py_DECREF(repunicode);
2534 }
2535 }
2536 }
2537 /* Resize if we allocated to much */
2538 respos = str-PyString_AS_STRING(res);
2539 if (respos<ressize)
2540 /* If this falls res will be NULL */
2541 _PyString_Resize(&res, respos);
2542 Py_XDECREF(errorHandler);
2543 Py_XDECREF(exc);
2544 return res;
2545
2546 onError:
2547 Py_XDECREF(res);
2548 Py_XDECREF(errorHandler);
2549 Py_XDECREF(exc);
2550 return NULL;
2551}
2552
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2554 int size,
2555 const char *errors)
2556{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002557 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558}
2559
2560PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2561{
2562 if (!PyUnicode_Check(unicode)) {
2563 PyErr_BadArgument();
2564 return NULL;
2565 }
2566 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2567 PyUnicode_GET_SIZE(unicode),
2568 NULL);
2569}
2570
2571/* --- 7-bit ASCII Codec -------------------------------------------------- */
2572
Guido van Rossumd57fd912000-03-10 22:53:23 +00002573PyObject *PyUnicode_DecodeASCII(const char *s,
2574 int size,
2575 const char *errors)
2576{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002577 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 PyUnicodeObject *v;
2579 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002580 int startinpos;
2581 int endinpos;
2582 int outpos;
2583 const char *e;
2584 PyObject *errorHandler = NULL;
2585 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002586
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002588 if (size == 1 && *(unsigned char*)s < 128) {
2589 Py_UNICODE r = *(unsigned char*)s;
2590 return PyUnicode_FromUnicode(&r, 1);
2591 }
Tim Petersced69f82003-09-16 20:30:58 +00002592
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 v = _PyUnicode_New(size);
2594 if (v == NULL)
2595 goto onError;
2596 if (size == 0)
2597 return (PyObject *)v;
2598 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002599 e = s + size;
2600 while (s < e) {
2601 register unsigned char c = (unsigned char)*s;
2602 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002604 ++s;
2605 }
2606 else {
2607 startinpos = s-starts;
2608 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002609 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002610 if (unicode_decode_call_errorhandler(
2611 errors, &errorHandler,
2612 "ascii", "ordinal not in range(128)",
2613 starts, size, &startinpos, &endinpos, &exc, &s,
2614 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002618 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002619 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002620 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002621 Py_XDECREF(errorHandler);
2622 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002624
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 onError:
2626 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 Py_XDECREF(errorHandler);
2628 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 return NULL;
2630}
2631
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2633 int size,
2634 const char *errors)
2635{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002636 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637}
2638
2639PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2640{
2641 if (!PyUnicode_Check(unicode)) {
2642 PyErr_BadArgument();
2643 return NULL;
2644 }
2645 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2646 PyUnicode_GET_SIZE(unicode),
2647 NULL);
2648}
2649
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002650#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002651
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002652/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002653
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002654PyObject *PyUnicode_DecodeMBCS(const char *s,
2655 int size,
2656 const char *errors)
2657{
2658 PyUnicodeObject *v;
2659 Py_UNICODE *p;
2660
2661 /* First get the size of the result */
2662 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002663 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002664 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2665
2666 v = _PyUnicode_New(usize);
2667 if (v == NULL)
2668 return NULL;
2669 if (usize == 0)
2670 return (PyObject *)v;
2671 p = PyUnicode_AS_UNICODE(v);
2672 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2673 Py_DECREF(v);
2674 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2675 }
2676
2677 return (PyObject *)v;
2678}
2679
2680PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2681 int size,
2682 const char *errors)
2683{
2684 PyObject *repr;
2685 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002686 DWORD mbcssize;
2687
2688 /* If there are no characters, bail now! */
2689 if (size==0)
2690 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002691
2692 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002693 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002694 if (mbcssize==0)
2695 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2696
2697 repr = PyString_FromStringAndSize(NULL, mbcssize);
2698 if (repr == NULL)
2699 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002700 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002701 return repr;
2702
2703 /* Do the conversion */
2704 s = PyString_AS_STRING(repr);
2705 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2706 Py_DECREF(repr);
2707 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2708 }
2709 return repr;
2710}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002711
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002712PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2713{
2714 if (!PyUnicode_Check(unicode)) {
2715 PyErr_BadArgument();
2716 return NULL;
2717 }
2718 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2719 PyUnicode_GET_SIZE(unicode),
2720 NULL);
2721}
2722
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002723#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002724
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725/* --- Character Mapping Codec -------------------------------------------- */
2726
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727PyObject *PyUnicode_DecodeCharmap(const char *s,
2728 int size,
2729 PyObject *mapping,
2730 const char *errors)
2731{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 const char *starts = s;
2733 int startinpos;
2734 int endinpos;
2735 int outpos;
2736 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 PyUnicodeObject *v;
2738 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002739 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 PyObject *errorHandler = NULL;
2741 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002742
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 /* Default to Latin-1 */
2744 if (mapping == NULL)
2745 return PyUnicode_DecodeLatin1(s, size, errors);
2746
2747 v = _PyUnicode_New(size);
2748 if (v == NULL)
2749 goto onError;
2750 if (size == 0)
2751 return (PyObject *)v;
2752 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 e = s + size;
2754 while (s < e) {
2755 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 PyObject *w, *x;
2757
2758 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2759 w = PyInt_FromLong((long)ch);
2760 if (w == NULL)
2761 goto onError;
2762 x = PyObject_GetItem(mapping, w);
2763 Py_DECREF(w);
2764 if (x == NULL) {
2765 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002766 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002768 x = Py_None;
2769 Py_INCREF(x);
2770 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002771 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 }
2773
2774 /* Apply mapping */
2775 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002776 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 if (value < 0 || value > 65535) {
2778 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002779 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 Py_DECREF(x);
2781 goto onError;
2782 }
2783 *p++ = (Py_UNICODE)value;
2784 }
2785 else if (x == Py_None) {
2786 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002787 outpos = p-PyUnicode_AS_UNICODE(v);
2788 startinpos = s-starts;
2789 endinpos = startinpos+1;
2790 if (unicode_decode_call_errorhandler(
2791 errors, &errorHandler,
2792 "charmap", "character maps to <undefined>",
2793 starts, size, &startinpos, &endinpos, &exc, &s,
2794 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 Py_DECREF(x);
2796 goto onError;
2797 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002798 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 }
2800 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002801 int targetsize = PyUnicode_GET_SIZE(x);
2802
2803 if (targetsize == 1)
2804 /* 1-1 mapping */
2805 *p++ = *PyUnicode_AS_UNICODE(x);
2806
2807 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002809 if (targetsize > extrachars) {
2810 /* resize first */
2811 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2812 int needed = (targetsize - extrachars) + \
2813 (targetsize << 2);
2814 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002815 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002816 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002817 Py_DECREF(x);
2818 goto onError;
2819 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002820 p = PyUnicode_AS_UNICODE(v) + oldpos;
2821 }
2822 Py_UNICODE_COPY(p,
2823 PyUnicode_AS_UNICODE(x),
2824 targetsize);
2825 p += targetsize;
2826 extrachars -= targetsize;
2827 }
2828 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829 }
2830 else {
2831 /* wrong return value */
2832 PyErr_SetString(PyExc_TypeError,
2833 "character mapping must return integer, None or unicode");
2834 Py_DECREF(x);
2835 goto onError;
2836 }
2837 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 }
2840 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002841 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002842 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002843 Py_XDECREF(errorHandler);
2844 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002846
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002848 Py_XDECREF(errorHandler);
2849 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 Py_XDECREF(v);
2851 return NULL;
2852}
2853
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002854/* Lookup the character ch in the mapping. If the character
2855 can't be found, Py_None is returned (or NULL, if another
2856 error occured). */
2857static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002859 PyObject *w = PyInt_FromLong((long)c);
2860 PyObject *x;
2861
2862 if (w == NULL)
2863 return NULL;
2864 x = PyObject_GetItem(mapping, w);
2865 Py_DECREF(w);
2866 if (x == NULL) {
2867 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2868 /* No mapping found means: mapping is undefined. */
2869 PyErr_Clear();
2870 x = Py_None;
2871 Py_INCREF(x);
2872 return x;
2873 } else
2874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002876 else if (x == Py_None)
2877 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002878 else if (PyInt_Check(x)) {
2879 long value = PyInt_AS_LONG(x);
2880 if (value < 0 || value > 255) {
2881 PyErr_SetString(PyExc_TypeError,
2882 "character mapping must be in range(256)");
2883 Py_DECREF(x);
2884 return NULL;
2885 }
2886 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002888 else if (PyString_Check(x))
2889 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002890 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002891 /* wrong return value */
2892 PyErr_SetString(PyExc_TypeError,
2893 "character mapping must return integer, None or str");
2894 Py_DECREF(x);
2895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896 }
2897}
2898
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002899/* lookup the character, put the result in the output string and adjust
2900 various state variables. Reallocate the output string if not enough
2901 space is available. Return a new reference to the object that
2902 was put in the output buffer, or Py_None, if the mapping was undefined
2903 (in which case no character was written) or NULL, if a
2904 reallocation error ocurred. The called must decref the result */
2905static
2906PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2907 PyObject **outobj, int *outpos)
2908{
2909 PyObject *rep = charmapencode_lookup(c, mapping);
2910
2911 if (rep==NULL)
2912 return NULL;
2913 else if (rep==Py_None)
2914 return rep;
2915 else {
2916 char *outstart = PyString_AS_STRING(*outobj);
2917 int outsize = PyString_GET_SIZE(*outobj);
2918 if (PyInt_Check(rep)) {
2919 int requiredsize = *outpos+1;
2920 if (outsize<requiredsize) {
2921 /* exponentially overallocate to minimize reallocations */
2922 if (requiredsize < 2*outsize)
2923 requiredsize = 2*outsize;
2924 if (_PyString_Resize(outobj, requiredsize)) {
2925 Py_DECREF(rep);
2926 return NULL;
2927 }
2928 outstart = PyString_AS_STRING(*outobj);
2929 }
2930 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2931 }
2932 else {
2933 const char *repchars = PyString_AS_STRING(rep);
2934 int repsize = PyString_GET_SIZE(rep);
2935 int requiredsize = *outpos+repsize;
2936 if (outsize<requiredsize) {
2937 /* exponentially overallocate to minimize reallocations */
2938 if (requiredsize < 2*outsize)
2939 requiredsize = 2*outsize;
2940 if (_PyString_Resize(outobj, requiredsize)) {
2941 Py_DECREF(rep);
2942 return NULL;
2943 }
2944 outstart = PyString_AS_STRING(*outobj);
2945 }
2946 memcpy(outstart + *outpos, repchars, repsize);
2947 *outpos += repsize;
2948 }
2949 }
2950 return rep;
2951}
2952
2953/* handle an error in PyUnicode_EncodeCharmap
2954 Return 0 on success, -1 on error */
2955static
2956int charmap_encoding_error(
2957 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2958 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002959 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002960 PyObject **res, int *respos)
2961{
2962 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2963 int repsize;
2964 int newpos;
2965 Py_UNICODE *uni2;
2966 /* startpos for collecting unencodable chars */
2967 int collstartpos = *inpos;
2968 int collendpos = *inpos+1;
2969 int collpos;
2970 char *encoding = "charmap";
2971 char *reason = "character maps to <undefined>";
2972
2973 PyObject *x;
2974 /* find all unencodable characters */
2975 while (collendpos < size) {
2976 x = charmapencode_lookup(p[collendpos], mapping);
2977 if (x==NULL)
2978 return -1;
2979 else if (x!=Py_None) {
2980 Py_DECREF(x);
2981 break;
2982 }
2983 Py_DECREF(x);
2984 ++collendpos;
2985 }
2986 /* cache callback name lookup
2987 * (if not done yet, i.e. it's the first error) */
2988 if (*known_errorHandler==-1) {
2989 if ((errors==NULL) || (!strcmp(errors, "strict")))
2990 *known_errorHandler = 1;
2991 else if (!strcmp(errors, "replace"))
2992 *known_errorHandler = 2;
2993 else if (!strcmp(errors, "ignore"))
2994 *known_errorHandler = 3;
2995 else if (!strcmp(errors, "xmlcharrefreplace"))
2996 *known_errorHandler = 4;
2997 else
2998 *known_errorHandler = 0;
2999 }
3000 switch (*known_errorHandler) {
3001 case 1: /* strict */
3002 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3003 return -1;
3004 case 2: /* replace */
3005 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3006 x = charmapencode_output('?', mapping, res, respos);
3007 if (x==NULL) {
3008 return -1;
3009 }
3010 else if (x==Py_None) {
3011 Py_DECREF(x);
3012 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3013 return -1;
3014 }
3015 Py_DECREF(x);
3016 }
3017 /* fall through */
3018 case 3: /* ignore */
3019 *inpos = collendpos;
3020 break;
3021 case 4: /* xmlcharrefreplace */
3022 /* generate replacement (temporarily (mis)uses p) */
3023 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3024 char buffer[2+29+1+1];
3025 char *cp;
3026 sprintf(buffer, "&#%d;", (int)p[collpos]);
3027 for (cp = buffer; *cp; ++cp) {
3028 x = charmapencode_output(*cp, mapping, res, respos);
3029 if (x==NULL)
3030 return -1;
3031 else if (x==Py_None) {
3032 Py_DECREF(x);
3033 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3034 return -1;
3035 }
3036 Py_DECREF(x);
3037 }
3038 }
3039 *inpos = collendpos;
3040 break;
3041 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003042 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003043 encoding, reason, p, size, exceptionObject,
3044 collstartpos, collendpos, &newpos);
3045 if (repunicode == NULL)
3046 return -1;
3047 /* generate replacement */
3048 repsize = PyUnicode_GET_SIZE(repunicode);
3049 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3050 x = charmapencode_output(*uni2, mapping, res, respos);
3051 if (x==NULL) {
3052 Py_DECREF(repunicode);
3053 return -1;
3054 }
3055 else if (x==Py_None) {
3056 Py_DECREF(repunicode);
3057 Py_DECREF(x);
3058 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3059 return -1;
3060 }
3061 Py_DECREF(x);
3062 }
3063 *inpos = newpos;
3064 Py_DECREF(repunicode);
3065 }
3066 return 0;
3067}
3068
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3070 int size,
3071 PyObject *mapping,
3072 const char *errors)
3073{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003074 /* output object */
3075 PyObject *res = NULL;
3076 /* current input position */
3077 int inpos = 0;
3078 /* current output position */
3079 int respos = 0;
3080 PyObject *errorHandler = NULL;
3081 PyObject *exc = NULL;
3082 /* the following variable is used for caching string comparisons
3083 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3084 * 3=ignore, 4=xmlcharrefreplace */
3085 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086
3087 /* Default to Latin-1 */
3088 if (mapping == NULL)
3089 return PyUnicode_EncodeLatin1(p, size, errors);
3090
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 /* allocate enough for a simple encoding without
3092 replacements, if we need more, we'll resize */
3093 res = PyString_FromStringAndSize(NULL, size);
3094 if (res == NULL)
3095 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003096 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003097 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003099 while (inpos<size) {
3100 /* try to encode it */
3101 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3102 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003104 if (x==Py_None) { /* unencodable character */
3105 if (charmap_encoding_error(p, size, &inpos, mapping,
3106 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003107 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003108 &res, &respos)) {
3109 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003110 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003111 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003113 else
3114 /* done with this character => adjust input position */
3115 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 Py_DECREF(x);
3117 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003119 /* Resize if we allocated to much */
3120 if (respos<PyString_GET_SIZE(res)) {
3121 if (_PyString_Resize(&res, respos))
3122 goto onError;
3123 }
3124 Py_XDECREF(exc);
3125 Py_XDECREF(errorHandler);
3126 return res;
3127
3128 onError:
3129 Py_XDECREF(res);
3130 Py_XDECREF(exc);
3131 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 return NULL;
3133}
3134
3135PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3136 PyObject *mapping)
3137{
3138 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3139 PyErr_BadArgument();
3140 return NULL;
3141 }
3142 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3143 PyUnicode_GET_SIZE(unicode),
3144 mapping,
3145 NULL);
3146}
3147
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003148/* create or adjust a UnicodeTranslateError */
3149static void make_translate_exception(PyObject **exceptionObject,
3150 const Py_UNICODE *unicode, int size,
3151 int startpos, int endpos,
3152 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003154 if (*exceptionObject == NULL) {
3155 *exceptionObject = PyUnicodeTranslateError_Create(
3156 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 }
3158 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3160 goto onError;
3161 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3162 goto onError;
3163 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3164 goto onError;
3165 return;
3166 onError:
3167 Py_DECREF(*exceptionObject);
3168 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169 }
3170}
3171
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003172/* raises a UnicodeTranslateError */
3173static void raise_translate_exception(PyObject **exceptionObject,
3174 const Py_UNICODE *unicode, int size,
3175 int startpos, int endpos,
3176 const char *reason)
3177{
3178 make_translate_exception(exceptionObject,
3179 unicode, size, startpos, endpos, reason);
3180 if (*exceptionObject != NULL)
3181 PyCodec_StrictErrors(*exceptionObject);
3182}
3183
3184/* error handling callback helper:
3185 build arguments, call the callback and check the arguments,
3186 put the result into newpos and return the replacement string, which
3187 has to be freed by the caller */
3188static PyObject *unicode_translate_call_errorhandler(const char *errors,
3189 PyObject **errorHandler,
3190 const char *reason,
3191 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3192 int startpos, int endpos,
3193 int *newpos)
3194{
3195 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3196
3197 PyObject *restuple;
3198 PyObject *resunicode;
3199
3200 if (*errorHandler == NULL) {
3201 *errorHandler = PyCodec_LookupError(errors);
3202 if (*errorHandler == NULL)
3203 return NULL;
3204 }
3205
3206 make_translate_exception(exceptionObject,
3207 unicode, size, startpos, endpos, reason);
3208 if (*exceptionObject == NULL)
3209 return NULL;
3210
3211 restuple = PyObject_CallFunctionObjArgs(
3212 *errorHandler, *exceptionObject, NULL);
3213 if (restuple == NULL)
3214 return NULL;
3215 if (!PyTuple_Check(restuple)) {
3216 PyErr_Format(PyExc_TypeError, &argparse[4]);
3217 Py_DECREF(restuple);
3218 return NULL;
3219 }
3220 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3221 &resunicode, newpos)) {
3222 Py_DECREF(restuple);
3223 return NULL;
3224 }
3225 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003226 *newpos = size+*newpos;
3227 if (*newpos<0 || *newpos>size) {
3228 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3229 Py_DECREF(restuple);
3230 return NULL;
3231 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 Py_INCREF(resunicode);
3233 Py_DECREF(restuple);
3234 return resunicode;
3235}
3236
3237/* Lookup the character ch in the mapping and put the result in result,
3238 which must be decrefed by the caller.
3239 Return 0 on success, -1 on error */
3240static
3241int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3242{
3243 PyObject *w = PyInt_FromLong((long)c);
3244 PyObject *x;
3245
3246 if (w == NULL)
3247 return -1;
3248 x = PyObject_GetItem(mapping, w);
3249 Py_DECREF(w);
3250 if (x == NULL) {
3251 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3252 /* No mapping found means: use 1:1 mapping. */
3253 PyErr_Clear();
3254 *result = NULL;
3255 return 0;
3256 } else
3257 return -1;
3258 }
3259 else if (x == Py_None) {
3260 *result = x;
3261 return 0;
3262 }
3263 else if (PyInt_Check(x)) {
3264 long value = PyInt_AS_LONG(x);
3265 long max = PyUnicode_GetMax();
3266 if (value < 0 || value > max) {
3267 PyErr_Format(PyExc_TypeError,
3268 "character mapping must be in range(0x%lx)", max+1);
3269 Py_DECREF(x);
3270 return -1;
3271 }
3272 *result = x;
3273 return 0;
3274 }
3275 else if (PyUnicode_Check(x)) {
3276 *result = x;
3277 return 0;
3278 }
3279 else {
3280 /* wrong return value */
3281 PyErr_SetString(PyExc_TypeError,
3282 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003283 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003284 return -1;
3285 }
3286}
3287/* ensure that *outobj is at least requiredsize characters long,
3288if not reallocate and adjust various state variables.
3289Return 0 on success, -1 on error */
3290static
Walter Dörwald4894c302003-10-24 14:25:28 +00003291int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292 int requiredsize)
3293{
Walter Dörwald4894c302003-10-24 14:25:28 +00003294 int oldsize = PyUnicode_GET_SIZE(*outobj);
3295 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 /* remember old output position */
3297 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3298 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003299 if (requiredsize < 2 * oldsize)
3300 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003301 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 return -1;
3303 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 }
3305 return 0;
3306}
3307/* lookup the character, put the result in the output string and adjust
3308 various state variables. Return a new reference to the object that
3309 was put in the output buffer in *result, or Py_None, if the mapping was
3310 undefined (in which case no character was written).
3311 The called must decref result.
3312 Return 0 on success, -1 on error. */
3313static
Walter Dörwald4894c302003-10-24 14:25:28 +00003314int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3315 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3316 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317{
Walter Dörwald4894c302003-10-24 14:25:28 +00003318 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 return -1;
3320 if (*res==NULL) {
3321 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003322 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003323 }
3324 else if (*res==Py_None)
3325 ;
3326 else if (PyInt_Check(*res)) {
3327 /* no overflow check, because we know that the space is enough */
3328 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3329 }
3330 else if (PyUnicode_Check(*res)) {
3331 int repsize = PyUnicode_GET_SIZE(*res);
3332 if (repsize==1) {
3333 /* no overflow check, because we know that the space is enough */
3334 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3335 }
3336 else if (repsize!=0) {
3337 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003338 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003339 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003340 repsize - 1;
3341 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 return -1;
3343 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3344 *outp += repsize;
3345 }
3346 }
3347 else
3348 return -1;
3349 return 0;
3350}
3351
3352PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 int size,
3354 PyObject *mapping,
3355 const char *errors)
3356{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003357 /* output object */
3358 PyObject *res = NULL;
3359 /* pointers to the beginning and end+1 of input */
3360 const Py_UNICODE *startp = p;
3361 const Py_UNICODE *endp = p + size;
3362 /* pointer into the output */
3363 Py_UNICODE *str;
3364 /* current output position */
3365 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366 char *reason = "character maps to <undefined>";
3367 PyObject *errorHandler = NULL;
3368 PyObject *exc = NULL;
3369 /* the following variable is used for caching string comparisons
3370 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3371 * 3=ignore, 4=xmlcharrefreplace */
3372 int known_errorHandler = -1;
3373
Guido van Rossumd57fd912000-03-10 22:53:23 +00003374 if (mapping == NULL) {
3375 PyErr_BadArgument();
3376 return NULL;
3377 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378
3379 /* allocate enough for a simple 1:1 translation without
3380 replacements, if we need more, we'll resize */
3381 res = PyUnicode_FromUnicode(NULL, size);
3382 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003383 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003385 return res;
3386 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003387
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003388 while (p<endp) {
3389 /* try to encode it */
3390 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003391 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003392 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 goto onError;
3394 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003395 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396 if (x!=Py_None) /* it worked => adjust input pointer */
3397 ++p;
3398 else { /* untranslatable character */
3399 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3400 int repsize;
3401 int newpos;
3402 Py_UNICODE *uni2;
3403 /* startpos for collecting untranslatable chars */
3404 const Py_UNICODE *collstart = p;
3405 const Py_UNICODE *collend = p+1;
3406 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003408 /* find all untranslatable characters */
3409 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003410 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 goto onError;
3412 Py_XDECREF(x);
3413 if (x!=Py_None)
3414 break;
3415 ++collend;
3416 }
3417 /* cache callback name lookup
3418 * (if not done yet, i.e. it's the first error) */
3419 if (known_errorHandler==-1) {
3420 if ((errors==NULL) || (!strcmp(errors, "strict")))
3421 known_errorHandler = 1;
3422 else if (!strcmp(errors, "replace"))
3423 known_errorHandler = 2;
3424 else if (!strcmp(errors, "ignore"))
3425 known_errorHandler = 3;
3426 else if (!strcmp(errors, "xmlcharrefreplace"))
3427 known_errorHandler = 4;
3428 else
3429 known_errorHandler = 0;
3430 }
3431 switch (known_errorHandler) {
3432 case 1: /* strict */
3433 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3434 goto onError;
3435 case 2: /* replace */
3436 /* No need to check for space, this is a 1:1 replacement */
3437 for (coll = collstart; coll<collend; ++coll)
3438 *str++ = '?';
3439 /* fall through */
3440 case 3: /* ignore */
3441 p = collend;
3442 break;
3443 case 4: /* xmlcharrefreplace */
3444 /* generate replacement (temporarily (mis)uses p) */
3445 for (p = collstart; p < collend; ++p) {
3446 char buffer[2+29+1+1];
3447 char *cp;
3448 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003449 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3451 goto onError;
3452 for (cp = buffer; *cp; ++cp)
3453 *str++ = *cp;
3454 }
3455 p = collend;
3456 break;
3457 default:
3458 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3459 reason, startp, size, &exc,
3460 collstart-startp, collend-startp, &newpos);
3461 if (repunicode == NULL)
3462 goto onError;
3463 /* generate replacement */
3464 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003465 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3467 Py_DECREF(repunicode);
3468 goto onError;
3469 }
3470 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3471 *str++ = *uni2;
3472 p = startp + newpos;
3473 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474 }
3475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003477 /* Resize if we allocated to much */
3478 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003479 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003480 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003481 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 }
3483 Py_XDECREF(exc);
3484 Py_XDECREF(errorHandler);
3485 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 onError:
3488 Py_XDECREF(res);
3489 Py_XDECREF(exc);
3490 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 return NULL;
3492}
3493
3494PyObject *PyUnicode_Translate(PyObject *str,
3495 PyObject *mapping,
3496 const char *errors)
3497{
3498 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003499
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500 str = PyUnicode_FromObject(str);
3501 if (str == NULL)
3502 goto onError;
3503 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3504 PyUnicode_GET_SIZE(str),
3505 mapping,
3506 errors);
3507 Py_DECREF(str);
3508 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003509
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 onError:
3511 Py_XDECREF(str);
3512 return NULL;
3513}
Tim Petersced69f82003-09-16 20:30:58 +00003514
Guido van Rossum9e896b32000-04-05 20:11:21 +00003515/* --- Decimal Encoder ---------------------------------------------------- */
3516
3517int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3518 int length,
3519 char *output,
3520 const char *errors)
3521{
3522 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 PyObject *errorHandler = NULL;
3524 PyObject *exc = NULL;
3525 const char *encoding = "decimal";
3526 const char *reason = "invalid decimal Unicode string";
3527 /* the following variable is used for caching string comparisons
3528 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3529 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003530
3531 if (output == NULL) {
3532 PyErr_BadArgument();
3533 return -1;
3534 }
3535
3536 p = s;
3537 end = s + length;
3538 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003540 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 PyObject *repunicode;
3542 int repsize;
3543 int newpos;
3544 Py_UNICODE *uni2;
3545 Py_UNICODE *collstart;
3546 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003547
Guido van Rossum9e896b32000-04-05 20:11:21 +00003548 if (Py_UNICODE_ISSPACE(ch)) {
3549 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003551 continue;
3552 }
3553 decimal = Py_UNICODE_TODECIMAL(ch);
3554 if (decimal >= 0) {
3555 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003557 continue;
3558 }
Guido van Rossumba477042000-04-06 18:18:10 +00003559 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003560 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003562 continue;
3563 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564 /* All other characters are considered unencodable */
3565 collstart = p;
3566 collend = p+1;
3567 while (collend < end) {
3568 if ((0 < *collend && *collend < 256) ||
3569 !Py_UNICODE_ISSPACE(*collend) ||
3570 Py_UNICODE_TODECIMAL(*collend))
3571 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003572 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 /* cache callback name lookup
3574 * (if not done yet, i.e. it's the first error) */
3575 if (known_errorHandler==-1) {
3576 if ((errors==NULL) || (!strcmp(errors, "strict")))
3577 known_errorHandler = 1;
3578 else if (!strcmp(errors, "replace"))
3579 known_errorHandler = 2;
3580 else if (!strcmp(errors, "ignore"))
3581 known_errorHandler = 3;
3582 else if (!strcmp(errors, "xmlcharrefreplace"))
3583 known_errorHandler = 4;
3584 else
3585 known_errorHandler = 0;
3586 }
3587 switch (known_errorHandler) {
3588 case 1: /* strict */
3589 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3590 goto onError;
3591 case 2: /* replace */
3592 for (p = collstart; p < collend; ++p)
3593 *output++ = '?';
3594 /* fall through */
3595 case 3: /* ignore */
3596 p = collend;
3597 break;
3598 case 4: /* xmlcharrefreplace */
3599 /* generate replacement (temporarily (mis)uses p) */
3600 for (p = collstart; p < collend; ++p)
3601 output += sprintf(output, "&#%d;", (int)*p);
3602 p = collend;
3603 break;
3604 default:
3605 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3606 encoding, reason, s, length, &exc,
3607 collstart-s, collend-s, &newpos);
3608 if (repunicode == NULL)
3609 goto onError;
3610 /* generate replacement */
3611 repsize = PyUnicode_GET_SIZE(repunicode);
3612 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3613 Py_UNICODE ch = *uni2;
3614 if (Py_UNICODE_ISSPACE(ch))
3615 *output++ = ' ';
3616 else {
3617 decimal = Py_UNICODE_TODECIMAL(ch);
3618 if (decimal >= 0)
3619 *output++ = '0' + decimal;
3620 else if (0 < ch && ch < 256)
3621 *output++ = (char)ch;
3622 else {
3623 Py_DECREF(repunicode);
3624 raise_encode_exception(&exc, encoding,
3625 s, length, collstart-s, collend-s, reason);
3626 goto onError;
3627 }
3628 }
3629 }
3630 p = s + newpos;
3631 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003632 }
3633 }
3634 /* 0-terminate the output string */
3635 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636 Py_XDECREF(exc);
3637 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003638 return 0;
3639
3640 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641 Py_XDECREF(exc);
3642 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003643 return -1;
3644}
3645
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646/* --- Helpers ------------------------------------------------------------ */
3647
Tim Petersced69f82003-09-16 20:30:58 +00003648static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649int count(PyUnicodeObject *self,
3650 int start,
3651 int end,
3652 PyUnicodeObject *substring)
3653{
3654 int count = 0;
3655
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003656 if (start < 0)
3657 start += self->length;
3658 if (start < 0)
3659 start = 0;
3660 if (end > self->length)
3661 end = self->length;
3662 if (end < 0)
3663 end += self->length;
3664 if (end < 0)
3665 end = 0;
3666
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003667 if (substring->length == 0)
3668 return (end - start + 1);
3669
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670 end -= substring->length;
3671
3672 while (start <= end)
3673 if (Py_UNICODE_MATCH(self, start, substring)) {
3674 count++;
3675 start += substring->length;
3676 } else
3677 start++;
3678
3679 return count;
3680}
3681
3682int PyUnicode_Count(PyObject *str,
3683 PyObject *substr,
3684 int start,
3685 int end)
3686{
3687 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003688
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 str = PyUnicode_FromObject(str);
3690 if (str == NULL)
3691 return -1;
3692 substr = PyUnicode_FromObject(substr);
3693 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003694 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 return -1;
3696 }
Tim Petersced69f82003-09-16 20:30:58 +00003697
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 result = count((PyUnicodeObject *)str,
3699 start, end,
3700 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003701
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 Py_DECREF(str);
3703 Py_DECREF(substr);
3704 return result;
3705}
3706
Tim Petersced69f82003-09-16 20:30:58 +00003707static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708int findstring(PyUnicodeObject *self,
3709 PyUnicodeObject *substring,
3710 int start,
3711 int end,
3712 int direction)
3713{
3714 if (start < 0)
3715 start += self->length;
3716 if (start < 0)
3717 start = 0;
3718
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 if (end > self->length)
3720 end = self->length;
3721 if (end < 0)
3722 end += self->length;
3723 if (end < 0)
3724 end = 0;
3725
Guido van Rossum76afbd92002-08-20 17:29:29 +00003726 if (substring->length == 0)
3727 return (direction > 0) ? start : end;
3728
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 end -= substring->length;
3730
3731 if (direction < 0) {
3732 for (; end >= start; end--)
3733 if (Py_UNICODE_MATCH(self, end, substring))
3734 return end;
3735 } else {
3736 for (; start <= end; start++)
3737 if (Py_UNICODE_MATCH(self, start, substring))
3738 return start;
3739 }
3740
3741 return -1;
3742}
3743
3744int PyUnicode_Find(PyObject *str,
3745 PyObject *substr,
3746 int start,
3747 int end,
3748 int direction)
3749{
3750 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003751
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 str = PyUnicode_FromObject(str);
3753 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003754 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 substr = PyUnicode_FromObject(substr);
3756 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003757 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003758 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759 }
Tim Petersced69f82003-09-16 20:30:58 +00003760
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 result = findstring((PyUnicodeObject *)str,
3762 (PyUnicodeObject *)substr,
3763 start, end, direction);
3764 Py_DECREF(str);
3765 Py_DECREF(substr);
3766 return result;
3767}
3768
Tim Petersced69f82003-09-16 20:30:58 +00003769static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770int tailmatch(PyUnicodeObject *self,
3771 PyUnicodeObject *substring,
3772 int start,
3773 int end,
3774 int direction)
3775{
3776 if (start < 0)
3777 start += self->length;
3778 if (start < 0)
3779 start = 0;
3780
3781 if (substring->length == 0)
3782 return 1;
3783
3784 if (end > self->length)
3785 end = self->length;
3786 if (end < 0)
3787 end += self->length;
3788 if (end < 0)
3789 end = 0;
3790
3791 end -= substring->length;
3792 if (end < start)
3793 return 0;
3794
3795 if (direction > 0) {
3796 if (Py_UNICODE_MATCH(self, end, substring))
3797 return 1;
3798 } else {
3799 if (Py_UNICODE_MATCH(self, start, substring))
3800 return 1;
3801 }
3802
3803 return 0;
3804}
3805
3806int PyUnicode_Tailmatch(PyObject *str,
3807 PyObject *substr,
3808 int start,
3809 int end,
3810 int direction)
3811{
3812 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003813
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 str = PyUnicode_FromObject(str);
3815 if (str == NULL)
3816 return -1;
3817 substr = PyUnicode_FromObject(substr);
3818 if (substr == NULL) {
3819 Py_DECREF(substr);
3820 return -1;
3821 }
Tim Petersced69f82003-09-16 20:30:58 +00003822
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 result = tailmatch((PyUnicodeObject *)str,
3824 (PyUnicodeObject *)substr,
3825 start, end, direction);
3826 Py_DECREF(str);
3827 Py_DECREF(substr);
3828 return result;
3829}
3830
Tim Petersced69f82003-09-16 20:30:58 +00003831static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832const Py_UNICODE *findchar(const Py_UNICODE *s,
3833 int size,
3834 Py_UNICODE ch)
3835{
3836 /* like wcschr, but doesn't stop at NULL characters */
3837
3838 while (size-- > 0) {
3839 if (*s == ch)
3840 return s;
3841 s++;
3842 }
3843
3844 return NULL;
3845}
3846
3847/* Apply fixfct filter to the Unicode object self and return a
3848 reference to the modified object */
3849
Tim Petersced69f82003-09-16 20:30:58 +00003850static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851PyObject *fixup(PyUnicodeObject *self,
3852 int (*fixfct)(PyUnicodeObject *s))
3853{
3854
3855 PyUnicodeObject *u;
3856
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003857 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 if (u == NULL)
3859 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003860
3861 Py_UNICODE_COPY(u->str, self->str, self->length);
3862
Tim Peters7a29bd52001-09-12 03:03:31 +00003863 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 /* fixfct should return TRUE if it modified the buffer. If
3865 FALSE, return a reference to the original buffer instead
3866 (to save space, not time) */
3867 Py_INCREF(self);
3868 Py_DECREF(u);
3869 return (PyObject*) self;
3870 }
3871 return (PyObject*) u;
3872}
3873
Tim Petersced69f82003-09-16 20:30:58 +00003874static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875int fixupper(PyUnicodeObject *self)
3876{
3877 int len = self->length;
3878 Py_UNICODE *s = self->str;
3879 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003880
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881 while (len-- > 0) {
3882 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003883
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 ch = Py_UNICODE_TOUPPER(*s);
3885 if (ch != *s) {
3886 status = 1;
3887 *s = ch;
3888 }
3889 s++;
3890 }
3891
3892 return status;
3893}
3894
Tim Petersced69f82003-09-16 20:30:58 +00003895static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896int fixlower(PyUnicodeObject *self)
3897{
3898 int len = self->length;
3899 Py_UNICODE *s = self->str;
3900 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003901
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 while (len-- > 0) {
3903 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003904
Guido van Rossumd57fd912000-03-10 22:53:23 +00003905 ch = Py_UNICODE_TOLOWER(*s);
3906 if (ch != *s) {
3907 status = 1;
3908 *s = ch;
3909 }
3910 s++;
3911 }
3912
3913 return status;
3914}
3915
Tim Petersced69f82003-09-16 20:30:58 +00003916static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917int fixswapcase(PyUnicodeObject *self)
3918{
3919 int len = self->length;
3920 Py_UNICODE *s = self->str;
3921 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003922
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 while (len-- > 0) {
3924 if (Py_UNICODE_ISUPPER(*s)) {
3925 *s = Py_UNICODE_TOLOWER(*s);
3926 status = 1;
3927 } else if (Py_UNICODE_ISLOWER(*s)) {
3928 *s = Py_UNICODE_TOUPPER(*s);
3929 status = 1;
3930 }
3931 s++;
3932 }
3933
3934 return status;
3935}
3936
Tim Petersced69f82003-09-16 20:30:58 +00003937static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938int fixcapitalize(PyUnicodeObject *self)
3939{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003940 int len = self->length;
3941 Py_UNICODE *s = self->str;
3942 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003943
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003944 if (len == 0)
3945 return 0;
3946 if (Py_UNICODE_ISLOWER(*s)) {
3947 *s = Py_UNICODE_TOUPPER(*s);
3948 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003950 s++;
3951 while (--len > 0) {
3952 if (Py_UNICODE_ISUPPER(*s)) {
3953 *s = Py_UNICODE_TOLOWER(*s);
3954 status = 1;
3955 }
3956 s++;
3957 }
3958 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959}
3960
3961static
3962int fixtitle(PyUnicodeObject *self)
3963{
3964 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3965 register Py_UNICODE *e;
3966 int previous_is_cased;
3967
3968 /* Shortcut for single character strings */
3969 if (PyUnicode_GET_SIZE(self) == 1) {
3970 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3971 if (*p != ch) {
3972 *p = ch;
3973 return 1;
3974 }
3975 else
3976 return 0;
3977 }
Tim Petersced69f82003-09-16 20:30:58 +00003978
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979 e = p + PyUnicode_GET_SIZE(self);
3980 previous_is_cased = 0;
3981 for (; p < e; p++) {
3982 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003983
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 if (previous_is_cased)
3985 *p = Py_UNICODE_TOLOWER(ch);
3986 else
3987 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003988
3989 if (Py_UNICODE_ISLOWER(ch) ||
3990 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991 Py_UNICODE_ISTITLE(ch))
3992 previous_is_cased = 1;
3993 else
3994 previous_is_cased = 0;
3995 }
3996 return 1;
3997}
3998
3999PyObject *PyUnicode_Join(PyObject *separator,
4000 PyObject *seq)
4001{
4002 Py_UNICODE *sep;
4003 int seplen;
4004 PyUnicodeObject *res = NULL;
4005 int reslen = 0;
4006 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 int sz = 100;
4008 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00004009 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010
Tim Peters2cfe3682001-05-05 05:36:48 +00004011 it = PyObject_GetIter(seq);
4012 if (it == NULL)
4013 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014
4015 if (separator == NULL) {
4016 Py_UNICODE blank = ' ';
4017 sep = &blank;
4018 seplen = 1;
4019 }
4020 else {
4021 separator = PyUnicode_FromObject(separator);
4022 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00004023 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 sep = PyUnicode_AS_UNICODE(separator);
4025 seplen = PyUnicode_GET_SIZE(separator);
4026 }
Tim Petersced69f82003-09-16 20:30:58 +00004027
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028 res = _PyUnicode_New(sz);
4029 if (res == NULL)
4030 goto onError;
4031 p = PyUnicode_AS_UNICODE(res);
4032 reslen = 0;
4033
Tim Peters2cfe3682001-05-05 05:36:48 +00004034 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00004036 PyObject *item = PyIter_Next(it);
4037 if (item == NULL) {
4038 if (PyErr_Occurred())
4039 goto onError;
4040 break;
4041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042 if (!PyUnicode_Check(item)) {
4043 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004044 if (!PyString_Check(item)) {
4045 PyErr_Format(PyExc_TypeError,
4046 "sequence item %i: expected string or Unicode,"
4047 " %.80s found",
4048 i, item->ob_type->tp_name);
4049 Py_DECREF(item);
4050 goto onError;
4051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 v = PyUnicode_FromObject(item);
4053 Py_DECREF(item);
4054 item = v;
4055 if (item == NULL)
4056 goto onError;
4057 }
4058 itemlen = PyUnicode_GET_SIZE(item);
4059 while (reslen + itemlen + seplen >= sz) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004060 if (_PyUnicode_Resize(&res, sz*2) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004061 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 sz *= 2;
4065 p = PyUnicode_AS_UNICODE(res) + reslen;
4066 }
4067 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004068 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 p += seplen;
4070 reslen += seplen;
4071 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004072 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 p += itemlen;
4074 reslen += itemlen;
4075 Py_DECREF(item);
4076 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004077 if (_PyUnicode_Resize(&res, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078 goto onError;
4079
4080 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004081 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 return (PyObject *)res;
4083
4084 onError:
4085 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004086 Py_XDECREF(res);
4087 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088 return NULL;
4089}
4090
Tim Petersced69f82003-09-16 20:30:58 +00004091static
4092PyUnicodeObject *pad(PyUnicodeObject *self,
4093 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 int right,
4095 Py_UNICODE fill)
4096{
4097 PyUnicodeObject *u;
4098
4099 if (left < 0)
4100 left = 0;
4101 if (right < 0)
4102 right = 0;
4103
Tim Peters7a29bd52001-09-12 03:03:31 +00004104 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105 Py_INCREF(self);
4106 return self;
4107 }
4108
4109 u = _PyUnicode_New(left + self->length + right);
4110 if (u) {
4111 if (left)
4112 Py_UNICODE_FILL(u->str, fill, left);
4113 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4114 if (right)
4115 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4116 }
4117
4118 return u;
4119}
4120
/* Append the slice data[left:right] to `list` as a new unicode object.

   Expands to several statements (it is not wrapped in do/while) and
   relies on the expansion site providing a `str` scratch variable, a
   `list` object and an `onError` label, to which it jumps when creating
   the slice or appending it fails.  The temporary reference held in str
   is released on both paths after the append attempt. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
4131
/* Insert the slice data[left:right] at the front (index 0) of `list` as
   a new unicode object.

   Expands to several statements (it is not wrapped in do/while) and
   relies on the expansion site providing a `str` scratch variable, a
   `list` object and an `onError` label, to which it jumps when creating
   the slice or inserting it fails.  The temporary reference held in str
   is released on both paths after the insert attempt. */
#define SPLIT_INSERT(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Insert(list, 0, str) != 0) {                             \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
4142
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143static
4144PyObject *split_whitespace(PyUnicodeObject *self,
4145 PyObject *list,
4146 int maxcount)
4147{
4148 register int i;
4149 register int j;
4150 int len = self->length;
4151 PyObject *str;
4152
4153 for (i = j = 0; i < len; ) {
4154 /* find a token */
4155 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4156 i++;
4157 j = i;
4158 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4159 i++;
4160 if (j < i) {
4161 if (maxcount-- <= 0)
4162 break;
4163 SPLIT_APPEND(self->str, j, i);
4164 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4165 i++;
4166 j = i;
4167 }
4168 }
4169 if (j < len) {
4170 SPLIT_APPEND(self->str, j, len);
4171 }
4172 return list;
4173
4174 onError:
4175 Py_DECREF(list);
4176 return NULL;
4177}
4178
4179PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004180 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181{
4182 register int i;
4183 register int j;
4184 int len;
4185 PyObject *list;
4186 PyObject *str;
4187 Py_UNICODE *data;
4188
4189 string = PyUnicode_FromObject(string);
4190 if (string == NULL)
4191 return NULL;
4192 data = PyUnicode_AS_UNICODE(string);
4193 len = PyUnicode_GET_SIZE(string);
4194
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195 list = PyList_New(0);
4196 if (!list)
4197 goto onError;
4198
4199 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004200 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004201
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202 /* Find a line and append it */
4203 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4204 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205
4206 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004207 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208 if (i < len) {
4209 if (data[i] == '\r' && i + 1 < len &&
4210 data[i+1] == '\n')
4211 i += 2;
4212 else
4213 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004214 if (keepends)
4215 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216 }
Guido van Rossum86662912000-04-11 15:38:46 +00004217 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218 j = i;
4219 }
4220 if (j < len) {
4221 SPLIT_APPEND(data, j, len);
4222 }
4223
4224 Py_DECREF(string);
4225 return list;
4226
4227 onError:
4228 Py_DECREF(list);
4229 Py_DECREF(string);
4230 return NULL;
4231}
4232
Tim Petersced69f82003-09-16 20:30:58 +00004233static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234PyObject *split_char(PyUnicodeObject *self,
4235 PyObject *list,
4236 Py_UNICODE ch,
4237 int maxcount)
4238{
4239 register int i;
4240 register int j;
4241 int len = self->length;
4242 PyObject *str;
4243
4244 for (i = j = 0; i < len; ) {
4245 if (self->str[i] == ch) {
4246 if (maxcount-- <= 0)
4247 break;
4248 SPLIT_APPEND(self->str, j, i);
4249 i = j = i + 1;
4250 } else
4251 i++;
4252 }
4253 if (j <= len) {
4254 SPLIT_APPEND(self->str, j, len);
4255 }
4256 return list;
4257
4258 onError:
4259 Py_DECREF(list);
4260 return NULL;
4261}
4262
Tim Petersced69f82003-09-16 20:30:58 +00004263static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004264PyObject *split_substring(PyUnicodeObject *self,
4265 PyObject *list,
4266 PyUnicodeObject *substring,
4267 int maxcount)
4268{
4269 register int i;
4270 register int j;
4271 int len = self->length;
4272 int sublen = substring->length;
4273 PyObject *str;
4274
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004275 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 if (Py_UNICODE_MATCH(self, i, substring)) {
4277 if (maxcount-- <= 0)
4278 break;
4279 SPLIT_APPEND(self->str, j, i);
4280 i = j = i + sublen;
4281 } else
4282 i++;
4283 }
4284 if (j <= len) {
4285 SPLIT_APPEND(self->str, j, len);
4286 }
4287 return list;
4288
4289 onError:
4290 Py_DECREF(list);
4291 return NULL;
4292}
4293
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004294static
4295PyObject *rsplit_whitespace(PyUnicodeObject *self,
4296 PyObject *list,
4297 int maxcount)
4298{
4299 register int i;
4300 register int j;
4301 int len = self->length;
4302 PyObject *str;
4303
4304 for (i = j = len - 1; i >= 0; ) {
4305 /* find a token */
4306 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4307 i--;
4308 j = i;
4309 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4310 i--;
4311 if (j > i) {
4312 if (maxcount-- <= 0)
4313 break;
4314 SPLIT_INSERT(self->str, i + 1, j + 1);
4315 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4316 i--;
4317 j = i;
4318 }
4319 }
4320 if (j >= 0) {
4321 SPLIT_INSERT(self->str, 0, j + 1);
4322 }
4323 return list;
4324
4325 onError:
4326 Py_DECREF(list);
4327 return NULL;
4328}
4329
4330static
4331PyObject *rsplit_char(PyUnicodeObject *self,
4332 PyObject *list,
4333 Py_UNICODE ch,
4334 int maxcount)
4335{
4336 register int i;
4337 register int j;
4338 int len = self->length;
4339 PyObject *str;
4340
4341 for (i = j = len - 1; i >= 0; ) {
4342 if (self->str[i] == ch) {
4343 if (maxcount-- <= 0)
4344 break;
4345 SPLIT_INSERT(self->str, i + 1, j + 1);
4346 j = i = i - 1;
4347 } else
4348 i--;
4349 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004350 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004351 SPLIT_INSERT(self->str, 0, j + 1);
4352 }
4353 return list;
4354
4355 onError:
4356 Py_DECREF(list);
4357 return NULL;
4358}
4359
4360static
4361PyObject *rsplit_substring(PyUnicodeObject *self,
4362 PyObject *list,
4363 PyUnicodeObject *substring,
4364 int maxcount)
4365{
4366 register int i;
4367 register int j;
4368 int len = self->length;
4369 int sublen = substring->length;
4370 PyObject *str;
4371
4372 for (i = len - sublen, j = len; i >= 0; ) {
4373 if (Py_UNICODE_MATCH(self, i, substring)) {
4374 if (maxcount-- <= 0)
4375 break;
4376 SPLIT_INSERT(self->str, i + sublen, j);
4377 j = i;
4378 i -= sublen;
4379 } else
4380 i--;
4381 }
4382 if (j >= 0) {
4383 SPLIT_INSERT(self->str, 0, j);
4384 }
4385 return list;
4386
4387 onError:
4388 Py_DECREF(list);
4389 return NULL;
4390}
4391
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004393#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394
4395static
4396PyObject *split(PyUnicodeObject *self,
4397 PyUnicodeObject *substring,
4398 int maxcount)
4399{
4400 PyObject *list;
4401
4402 if (maxcount < 0)
4403 maxcount = INT_MAX;
4404
4405 list = PyList_New(0);
4406 if (!list)
4407 return NULL;
4408
4409 if (substring == NULL)
4410 return split_whitespace(self,list,maxcount);
4411
4412 else if (substring->length == 1)
4413 return split_char(self,list,substring->str[0],maxcount);
4414
4415 else if (substring->length == 0) {
4416 Py_DECREF(list);
4417 PyErr_SetString(PyExc_ValueError, "empty separator");
4418 return NULL;
4419 }
4420 else
4421 return split_substring(self,list,substring,maxcount);
4422}
4423
Tim Petersced69f82003-09-16 20:30:58 +00004424static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004425PyObject *rsplit(PyUnicodeObject *self,
4426 PyUnicodeObject *substring,
4427 int maxcount)
4428{
4429 PyObject *list;
4430
4431 if (maxcount < 0)
4432 maxcount = INT_MAX;
4433
4434 list = PyList_New(0);
4435 if (!list)
4436 return NULL;
4437
4438 if (substring == NULL)
4439 return rsplit_whitespace(self,list,maxcount);
4440
4441 else if (substring->length == 1)
4442 return rsplit_char(self,list,substring->str[0],maxcount);
4443
4444 else if (substring->length == 0) {
4445 Py_DECREF(list);
4446 PyErr_SetString(PyExc_ValueError, "empty separator");
4447 return NULL;
4448 }
4449 else
4450 return rsplit_substring(self,list,substring,maxcount);
4451}
4452
4453static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454PyObject *replace(PyUnicodeObject *self,
4455 PyUnicodeObject *str1,
4456 PyUnicodeObject *str2,
4457 int maxcount)
4458{
4459 PyUnicodeObject *u;
4460
4461 if (maxcount < 0)
4462 maxcount = INT_MAX;
4463
4464 if (str1->length == 1 && str2->length == 1) {
4465 int i;
4466
4467 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004468 if (!findchar(self->str, self->length, str1->str[0]) &&
4469 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 /* nothing to replace, return original string */
4471 Py_INCREF(self);
4472 u = self;
4473 } else {
4474 Py_UNICODE u1 = str1->str[0];
4475 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004476
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004478 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479 self->length
4480 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004481 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004482 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004483 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 for (i = 0; i < u->length; i++)
4485 if (u->str[i] == u1) {
4486 if (--maxcount < 0)
4487 break;
4488 u->str[i] = u2;
4489 }
4490 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004491 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492
4493 } else {
4494 int n, i;
4495 Py_UNICODE *p;
4496
4497 /* replace strings */
4498 n = count(self, 0, self->length, str1);
4499 if (n > maxcount)
4500 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004501 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004503 if (PyUnicode_CheckExact(self)) {
4504 Py_INCREF(self);
4505 u = self;
4506 }
4507 else {
4508 u = (PyUnicodeObject *)
4509 PyUnicode_FromUnicode(self->str, self->length);
4510 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511 } else {
4512 u = _PyUnicode_New(
4513 self->length + n * (str2->length - str1->length));
4514 if (u) {
4515 i = 0;
4516 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004517 if (str1->length > 0) {
4518 while (i <= self->length - str1->length)
4519 if (Py_UNICODE_MATCH(self, i, str1)) {
4520 /* replace string segment */
4521 Py_UNICODE_COPY(p, str2->str, str2->length);
4522 p += str2->length;
4523 i += str1->length;
4524 if (--n <= 0) {
4525 /* copy remaining part */
4526 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4527 break;
4528 }
4529 } else
4530 *p++ = self->str[i++];
4531 } else {
4532 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533 Py_UNICODE_COPY(p, str2->str, str2->length);
4534 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004535 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004538 }
4539 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 }
4542 }
4543 }
Tim Petersced69f82003-09-16 20:30:58 +00004544
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545 return (PyObject *) u;
4546}
4547
4548/* --- Unicode Object Methods --------------------------------------------- */
4549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004550PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551"S.title() -> unicode\n\
4552\n\
4553Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004554characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555
4556static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004557unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004558{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559 return fixup(self, fixtitle);
4560}
4561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004562PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563"S.capitalize() -> unicode\n\
4564\n\
4565Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004566have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567
4568static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004569unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571 return fixup(self, fixcapitalize);
4572}
4573
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).  Splits self on
   whitespace, replaces every word in the list by its fixcapitalize
   result, and joins the list again with PyUnicode_Join(NULL, ...).
   Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return result;
}
#endif
4611
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004612/* Argument converter. Coerces to a single unicode character */
4613
4614static int
4615convert_uc(PyObject *obj, void *addr)
4616{
4617 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4618 PyObject *uniobj;
4619 Py_UNICODE *unistr;
4620
4621 uniobj = PyUnicode_FromObject(obj);
4622 if (uniobj == NULL) {
4623 PyErr_SetString(PyExc_TypeError,
4624 "The fill character cannot be converted to Unicode");
4625 return 0;
4626 }
4627 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4628 PyErr_SetString(PyExc_TypeError,
4629 "The fill character must be exactly one character long");
4630 Py_DECREF(uniobj);
4631 return 0;
4632 }
4633 unistr = PyUnicode_AS_UNICODE(uniobj);
4634 *fillcharloc = unistr[0];
4635 Py_DECREF(uniobj);
4636 return 1;
4637}
4638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004639PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004640"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004642Return S centered in a Unicode string of length width. Padding is\n\
4643done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004644
4645static PyObject *
4646unicode_center(PyUnicodeObject *self, PyObject *args)
4647{
4648 int marg, left;
4649 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004650 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004652 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 return NULL;
4654
Tim Peters7a29bd52001-09-12 03:03:31 +00004655 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656 Py_INCREF(self);
4657 return (PyObject*) self;
4658 }
4659
4660 marg = width - self->length;
4661 left = marg / 2 + (marg & width & 1);
4662
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004663 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664}
4665
Marc-André Lemburge5034372000-08-08 08:04:29 +00004666#if 0
4667
4668/* This code should go into some future Unicode collation support
4669 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004670 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004671
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004672/* speedy UTF-16 code point order comparison */
4673/* gleaned from: */
4674/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4675
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004676static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004677{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004678 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004679 0, 0, 0, 0, 0, 0, 0, 0,
4680 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004681 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004682};
4683
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684static int
4685unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4686{
4687 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004688
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 Py_UNICODE *s1 = str1->str;
4690 Py_UNICODE *s2 = str2->str;
4691
4692 len1 = str1->length;
4693 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004694
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004696 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004697
4698 c1 = *s1++;
4699 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004700
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004701 if (c1 > (1<<11) * 26)
4702 c1 += utf16Fixup[c1>>11];
4703 if (c2 > (1<<11) * 26)
4704 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004705 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004706
4707 if (c1 != c2)
4708 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004709
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004710 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711 }
4712
4713 return (len1 < len2) ? -1 : (len1 != len2);
4714}
4715
Marc-André Lemburge5034372000-08-08 08:04:29 +00004716#else
4717
4718static int
4719unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4720{
4721 register int len1, len2;
4722
4723 Py_UNICODE *s1 = str1->str;
4724 Py_UNICODE *s2 = str2->str;
4725
4726 len1 = str1->length;
4727 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004728
Marc-André Lemburge5034372000-08-08 08:04:29 +00004729 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004730 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004731
Fredrik Lundh45714e92001-06-26 16:39:36 +00004732 c1 = *s1++;
4733 c2 = *s2++;
4734
4735 if (c1 != c2)
4736 return (c1 < c2) ? -1 : 1;
4737
Marc-André Lemburge5034372000-08-08 08:04:29 +00004738 len1--; len2--;
4739 }
4740
4741 return (len1 < len2) ? -1 : (len1 != len2);
4742}
4743
4744#endif
4745
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746int PyUnicode_Compare(PyObject *left,
4747 PyObject *right)
4748{
4749 PyUnicodeObject *u = NULL, *v = NULL;
4750 int result;
4751
4752 /* Coerce the two arguments */
4753 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4754 if (u == NULL)
4755 goto onError;
4756 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4757 if (v == NULL)
4758 goto onError;
4759
Thomas Wouters7e474022000-07-16 12:04:32 +00004760 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761 if (v == u) {
4762 Py_DECREF(u);
4763 Py_DECREF(v);
4764 return 0;
4765 }
4766
4767 result = unicode_compare(u, v);
4768
4769 Py_DECREF(u);
4770 Py_DECREF(v);
4771 return result;
4772
4773onError:
4774 Py_XDECREF(u);
4775 Py_XDECREF(v);
4776 return -1;
4777}
4778
Guido van Rossum403d68b2000-03-13 15:55:09 +00004779int PyUnicode_Contains(PyObject *container,
4780 PyObject *element)
4781{
4782 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004783 int result, size;
4784 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004785
4786 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004787 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004788 if (v == NULL) {
4789 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004790 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004791 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004792 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004793 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004794 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004795 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004796
Barry Warsaw817918c2002-08-06 16:58:21 +00004797 size = PyUnicode_GET_SIZE(v);
4798 rhs = PyUnicode_AS_UNICODE(v);
4799 lhs = PyUnicode_AS_UNICODE(u);
4800
Guido van Rossum403d68b2000-03-13 15:55:09 +00004801 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004802 if (size == 1) {
4803 end = lhs + PyUnicode_GET_SIZE(u);
4804 while (lhs < end) {
4805 if (*lhs++ == *rhs) {
4806 result = 1;
4807 break;
4808 }
4809 }
4810 }
4811 else {
4812 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4813 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004814 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004815 result = 1;
4816 break;
4817 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004818 }
4819 }
4820
4821 Py_DECREF(u);
4822 Py_DECREF(v);
4823 return result;
4824
4825onError:
4826 Py_XDECREF(u);
4827 Py_XDECREF(v);
4828 return -1;
4829}
4830
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831/* Concat to string or Unicode object giving a new Unicode object. */
4832
4833PyObject *PyUnicode_Concat(PyObject *left,
4834 PyObject *right)
4835{
4836 PyUnicodeObject *u = NULL, *v = NULL, *w;
4837
4838 /* Coerce the two arguments */
4839 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4840 if (u == NULL)
4841 goto onError;
4842 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4843 if (v == NULL)
4844 goto onError;
4845
4846 /* Shortcuts */
4847 if (v == unicode_empty) {
4848 Py_DECREF(v);
4849 return (PyObject *)u;
4850 }
4851 if (u == unicode_empty) {
4852 Py_DECREF(u);
4853 return (PyObject *)v;
4854 }
4855
4856 /* Concat the two Unicode strings */
4857 w = _PyUnicode_New(u->length + v->length);
4858 if (w == NULL)
4859 goto onError;
4860 Py_UNICODE_COPY(w->str, u->str, u->length);
4861 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4862
4863 Py_DECREF(u);
4864 Py_DECREF(v);
4865 return (PyObject *)w;
4866
4867onError:
4868 Py_XDECREF(u);
4869 Py_XDECREF(v);
4870 return NULL;
4871}
4872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004873PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874"S.count(sub[, start[, end]]) -> int\n\
4875\n\
4876Return the number of occurrences of substring sub in Unicode string\n\
4877S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004878interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
4880static PyObject *
4881unicode_count(PyUnicodeObject *self, PyObject *args)
4882{
4883 PyUnicodeObject *substring;
4884 int start = 0;
4885 int end = INT_MAX;
4886 PyObject *result;
4887
Guido van Rossumb8872e62000-05-09 14:14:27 +00004888 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4889 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 return NULL;
4891
4892 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4893 (PyObject *)substring);
4894 if (substring == NULL)
4895 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004896
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 if (start < 0)
4898 start += self->length;
4899 if (start < 0)
4900 start = 0;
4901 if (end > self->length)
4902 end = self->length;
4903 if (end < 0)
4904 end += self->length;
4905 if (end < 0)
4906 end = 0;
4907
4908 result = PyInt_FromLong((long) count(self, start, end, substring));
4909
4910 Py_DECREF(substring);
4911 return result;
4912}
4913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004914PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004915"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004917Encodes S using the codec registered for encoding. encoding defaults\n\
4918to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004919handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4921'xmlcharrefreplace' as well as any other name registered with\n\
4922codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923
4924static PyObject *
4925unicode_encode(PyUnicodeObject *self, PyObject *args)
4926{
4927 char *encoding = NULL;
4928 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004929 PyObject *v;
4930
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4932 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004933 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004934 if (v == NULL)
4935 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004936 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
4937 PyErr_Format(PyExc_TypeError,
4938 "encoder did not return a string/unicode object "
4939 "(type=%.400s)",
4940 v->ob_type->tp_name);
4941 Py_DECREF(v);
4942 return NULL;
4943 }
4944 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004945
4946 onError:
4947 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004948}
4949
4950PyDoc_STRVAR(decode__doc__,
4951"S.decode([encoding[,errors]]) -> string or unicode\n\
4952\n\
4953Decodes S using the codec registered for encoding. encoding defaults\n\
4954to the default encoding. errors may be given to set a different error\n\
4955handling scheme. Default is 'strict' meaning that encoding errors raise\n\
4956a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
4957as well as any other name registerd with codecs.register_error that is\n\
4958able to handle UnicodeDecodeErrors.");
4959
4960static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00004961unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004962{
4963 char *encoding = NULL;
4964 char *errors = NULL;
4965 PyObject *v;
4966
4967 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
4968 return NULL;
4969 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004970 if (v == NULL)
4971 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004972 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
4973 PyErr_Format(PyExc_TypeError,
4974 "decoder did not return a string/unicode object "
4975 "(type=%.400s)",
4976 v->ob_type->tp_name);
4977 Py_DECREF(v);
4978 return NULL;
4979 }
4980 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004981
4982 onError:
4983 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984}
4985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004986PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987"S.expandtabs([tabsize]) -> unicode\n\
4988\n\
4989Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004990If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991
4992static PyObject*
4993unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4994{
4995 Py_UNICODE *e;
4996 Py_UNICODE *p;
4997 Py_UNICODE *q;
4998 int i, j;
4999 PyUnicodeObject *u;
5000 int tabsize = 8;
5001
5002 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5003 return NULL;
5004
Thomas Wouters7e474022000-07-16 12:04:32 +00005005 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006 i = j = 0;
5007 e = self->str + self->length;
5008 for (p = self->str; p < e; p++)
5009 if (*p == '\t') {
5010 if (tabsize > 0)
5011 j += tabsize - (j % tabsize);
5012 }
5013 else {
5014 j++;
5015 if (*p == '\n' || *p == '\r') {
5016 i += j;
5017 j = 0;
5018 }
5019 }
5020
5021 /* Second pass: create output string and fill it */
5022 u = _PyUnicode_New(i + j);
5023 if (!u)
5024 return NULL;
5025
5026 j = 0;
5027 q = u->str;
5028
5029 for (p = self->str; p < e; p++)
5030 if (*p == '\t') {
5031 if (tabsize > 0) {
5032 i = tabsize - (j % tabsize);
5033 j += i;
5034 while (i--)
5035 *q++ = ' ';
5036 }
5037 }
5038 else {
5039 j++;
5040 *q++ = *p;
5041 if (*p == '\n' || *p == '\r')
5042 j = 0;
5043 }
5044
5045 return (PyObject*) u;
5046}
5047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005048PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049"S.find(sub [,start [,end]]) -> int\n\
5050\n\
5051Return the lowest index in S where substring sub is found,\n\
5052such that sub is contained within s[start,end]. Optional\n\
5053arguments start and end are interpreted as in slice notation.\n\
5054\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005055Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056
5057static PyObject *
5058unicode_find(PyUnicodeObject *self, PyObject *args)
5059{
5060 PyUnicodeObject *substring;
5061 int start = 0;
5062 int end = INT_MAX;
5063 PyObject *result;
5064
Guido van Rossumb8872e62000-05-09 14:14:27 +00005065 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5066 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067 return NULL;
5068 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5069 (PyObject *)substring);
5070 if (substring == NULL)
5071 return NULL;
5072
5073 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5074
5075 Py_DECREF(substring);
5076 return result;
5077}
5078
5079static PyObject *
5080unicode_getitem(PyUnicodeObject *self, int index)
5081{
5082 if (index < 0 || index >= self->length) {
5083 PyErr_SetString(PyExc_IndexError, "string index out of range");
5084 return NULL;
5085 }
5086
5087 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5088}
5089
5090static long
5091unicode_hash(PyUnicodeObject *self)
5092{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005093 /* Since Unicode objects compare equal to their ASCII string
5094 counterparts, they should use the individual character values
5095 as basis for their hash value. This is needed to assure that
5096 strings and Unicode objects behave in the same way as
5097 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098
Fredrik Lundhdde61642000-07-10 18:27:47 +00005099 register int len;
5100 register Py_UNICODE *p;
5101 register long x;
5102
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 if (self->hash != -1)
5104 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005105 len = PyUnicode_GET_SIZE(self);
5106 p = PyUnicode_AS_UNICODE(self);
5107 x = *p << 7;
5108 while (--len >= 0)
5109 x = (1000003*x) ^ *p++;
5110 x ^= PyUnicode_GET_SIZE(self);
5111 if (x == -1)
5112 x = -2;
5113 self->hash = x;
5114 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115}
5116
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005117PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118"S.index(sub [,start [,end]]) -> int\n\
5119\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005120Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121
5122static PyObject *
5123unicode_index(PyUnicodeObject *self, PyObject *args)
5124{
5125 int result;
5126 PyUnicodeObject *substring;
5127 int start = 0;
5128 int end = INT_MAX;
5129
Guido van Rossumb8872e62000-05-09 14:14:27 +00005130 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5131 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005133
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5135 (PyObject *)substring);
5136 if (substring == NULL)
5137 return NULL;
5138
5139 result = findstring(self, substring, start, end, 1);
5140
5141 Py_DECREF(substring);
5142 if (result < 0) {
5143 PyErr_SetString(PyExc_ValueError, "substring not found");
5144 return NULL;
5145 }
5146 return PyInt_FromLong(result);
5147}
5148
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005149PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005150"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005152Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005153at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154
5155static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005156unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157{
5158 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5159 register const Py_UNICODE *e;
5160 int cased;
5161
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162 /* Shortcut for single character strings */
5163 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005164 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005166 /* Special case for empty strings */
5167 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005168 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005169
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 e = p + PyUnicode_GET_SIZE(self);
5171 cased = 0;
5172 for (; p < e; p++) {
5173 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005174
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005176 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 else if (!cased && Py_UNICODE_ISLOWER(ch))
5178 cased = 1;
5179 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005180 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181}
5182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005183PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005184"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005186Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005187at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188
5189static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005190unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191{
5192 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5193 register const Py_UNICODE *e;
5194 int cased;
5195
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 /* Shortcut for single character strings */
5197 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005198 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005200 /* Special case for empty strings */
5201 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005202 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005203
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 e = p + PyUnicode_GET_SIZE(self);
5205 cased = 0;
5206 for (; p < e; p++) {
5207 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005208
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005210 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 else if (!cased && Py_UNICODE_ISUPPER(ch))
5212 cased = 1;
5213 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005214 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215}
5216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005217PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005218"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005220Return True if S is a titlecased string and there is at least one\n\
5221character in S, i.e. upper- and titlecase characters may only\n\
5222follow uncased characters and lowercase characters only cased ones.\n\
5223Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224
5225static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005226unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227{
5228 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5229 register const Py_UNICODE *e;
5230 int cased, previous_is_cased;
5231
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 /* Shortcut for single character strings */
5233 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005234 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5235 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005237 /* Special case for empty strings */
5238 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005239 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005240
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 e = p + PyUnicode_GET_SIZE(self);
5242 cased = 0;
5243 previous_is_cased = 0;
5244 for (; p < e; p++) {
5245 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005246
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5248 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005249 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250 previous_is_cased = 1;
5251 cased = 1;
5252 }
5253 else if (Py_UNICODE_ISLOWER(ch)) {
5254 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005255 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256 previous_is_cased = 1;
5257 cased = 1;
5258 }
5259 else
5260 previous_is_cased = 0;
5261 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005262 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263}
5264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005265PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005266"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005268Return True if all characters in S are whitespace\n\
5269and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270
5271static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005272unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273{
5274 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5275 register const Py_UNICODE *e;
5276
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 /* Shortcut for single character strings */
5278 if (PyUnicode_GET_SIZE(self) == 1 &&
5279 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005280 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005282 /* Special case for empty strings */
5283 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005284 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005285
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 e = p + PyUnicode_GET_SIZE(self);
5287 for (; p < e; p++) {
5288 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005289 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005291 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292}
5293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005294PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005295"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005296\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005297Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005298and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005299
5300static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005301unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005302{
5303 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5304 register const Py_UNICODE *e;
5305
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005306 /* Shortcut for single character strings */
5307 if (PyUnicode_GET_SIZE(self) == 1 &&
5308 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005309 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005310
5311 /* Special case for empty strings */
5312 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005313 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005314
5315 e = p + PyUnicode_GET_SIZE(self);
5316 for (; p < e; p++) {
5317 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005318 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005319 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005320 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005321}
5322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005323PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005324"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005325\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005326Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005327and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005328
5329static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005330unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005331{
5332 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5333 register const Py_UNICODE *e;
5334
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005335 /* Shortcut for single character strings */
5336 if (PyUnicode_GET_SIZE(self) == 1 &&
5337 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005338 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005339
5340 /* Special case for empty strings */
5341 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005342 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005343
5344 e = p + PyUnicode_GET_SIZE(self);
5345 for (; p < e; p++) {
5346 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005347 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005348 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005349 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005350}
5351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005352PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005353"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005355Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005356False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357
5358static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005359unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360{
5361 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5362 register const Py_UNICODE *e;
5363
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364 /* Shortcut for single character strings */
5365 if (PyUnicode_GET_SIZE(self) == 1 &&
5366 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005367 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005369 /* Special case for empty strings */
5370 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005371 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005372
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373 e = p + PyUnicode_GET_SIZE(self);
5374 for (; p < e; p++) {
5375 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005376 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005378 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379}
5380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005381PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005382"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005384Return True if all characters in S are digits\n\
5385and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386
5387static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005388unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389{
5390 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5391 register const Py_UNICODE *e;
5392
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393 /* Shortcut for single character strings */
5394 if (PyUnicode_GET_SIZE(self) == 1 &&
5395 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005396 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005398 /* Special case for empty strings */
5399 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005400 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005401
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 e = p + PyUnicode_GET_SIZE(self);
5403 for (; p < e; p++) {
5404 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005405 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005407 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408}
5409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005410PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005411"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005413Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005414False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415
5416static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005417unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418{
5419 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5420 register const Py_UNICODE *e;
5421
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 /* Shortcut for single character strings */
5423 if (PyUnicode_GET_SIZE(self) == 1 &&
5424 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005425 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005427 /* Special case for empty strings */
5428 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005429 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005430
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 e = p + PyUnicode_GET_SIZE(self);
5432 for (; p < e; p++) {
5433 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005434 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005436 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437}
5438
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005439PyDoc_STRVAR(iswide__doc__,
5440"S.iswide() -> bool\n\
5441\n\
5442Return True if all characters in S are wide width\n\
5443and there is at least one character in S, False otherwise.");
5444
5445static PyObject*
5446unicode_iswide(PyUnicodeObject *self)
5447{
5448 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5449 register const Py_UNICODE *e;
5450
5451 /* Shortcut for single character strings */
5452 if (PyUnicode_GET_SIZE(self) == 1 &&
5453 Py_UNICODE_ISWIDE(*p))
5454 Py_RETURN_TRUE;
5455
5456 /* Special case for empty strings */
5457 if (PyString_GET_SIZE(self) == 0)
5458 Py_RETURN_FALSE;
5459
5460 e = p + PyUnicode_GET_SIZE(self);
5461 for (; p < e; p++) {
5462 if (!Py_UNICODE_ISWIDE(*p))
5463 Py_RETURN_FALSE;
5464 }
5465 Py_RETURN_TRUE;
5466}
5467
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005468PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469"S.join(sequence) -> unicode\n\
5470\n\
5471Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005472sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473
5474static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005475unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005477 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478}
5479
5480static int
5481unicode_length(PyUnicodeObject *self)
5482{
5483 return self->length;
5484}
5485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005486PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005487"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488\n\
5489Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005490done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491
5492static PyObject *
5493unicode_ljust(PyUnicodeObject *self, PyObject *args)
5494{
5495 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005496 Py_UNICODE fillchar = ' ';
5497
5498 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 return NULL;
5500
Tim Peters7a29bd52001-09-12 03:03:31 +00005501 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 Py_INCREF(self);
5503 return (PyObject*) self;
5504 }
5505
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005506 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507}
5508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005509PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510"S.lower() -> unicode\n\
5511\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005512Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513
5514static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005515unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 return fixup(self, fixlower);
5518}
5519
/* Selector values telling the strip helpers which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selector values above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its 3-char "|O:"
   prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
5528
5529static const Py_UNICODE *
5530unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5531{
Tim Peters030a5ce2002-04-22 19:00:10 +00005532 size_t i;
5533 for (i = 0; i < n; ++i)
5534 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005535 return s+i;
5536 return NULL;
5537}
5538
5539/* externally visible for str.strip(unicode) */
5540PyObject *
5541_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5542{
5543 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5544 int len = PyUnicode_GET_SIZE(self);
5545 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5546 int seplen = PyUnicode_GET_SIZE(sepobj);
5547 int i, j;
5548
5549 i = 0;
5550 if (striptype != RIGHTSTRIP) {
5551 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5552 i++;
5553 }
5554 }
5555
5556 j = len;
5557 if (striptype != LEFTSTRIP) {
5558 do {
5559 j--;
5560 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5561 j++;
5562 }
5563
5564 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5565 Py_INCREF(self);
5566 return (PyObject*)self;
5567 }
5568 else
5569 return PyUnicode_FromUnicode(s+i, j-i);
5570}
5571
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572
5573static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005574do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005576 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5577 int len = PyUnicode_GET_SIZE(self), i, j;
5578
5579 i = 0;
5580 if (striptype != RIGHTSTRIP) {
5581 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5582 i++;
5583 }
5584 }
5585
5586 j = len;
5587 if (striptype != LEFTSTRIP) {
5588 do {
5589 j--;
5590 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5591 j++;
5592 }
5593
5594 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5595 Py_INCREF(self);
5596 return (PyObject*)self;
5597 }
5598 else
5599 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600}
5601
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005602
5603static PyObject *
5604do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5605{
5606 PyObject *sep = NULL;
5607
5608 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5609 return NULL;
5610
5611 if (sep != NULL && sep != Py_None) {
5612 if (PyUnicode_Check(sep))
5613 return _PyUnicode_XStrip(self, striptype, sep);
5614 else if (PyString_Check(sep)) {
5615 PyObject *res;
5616 sep = PyUnicode_FromObject(sep);
5617 if (sep==NULL)
5618 return NULL;
5619 res = _PyUnicode_XStrip(self, striptype, sep);
5620 Py_DECREF(sep);
5621 return res;
5622 }
5623 else {
5624 PyErr_Format(PyExc_TypeError,
5625 "%s arg must be None, unicode or str",
5626 STRIPNAME(striptype));
5627 return NULL;
5628 }
5629 }
5630
5631 return do_strip(self, striptype);
5632}
5633
5634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005635PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005636"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005637\n\
5638Return a copy of the string S with leading and trailing\n\
5639whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005640If chars is given and not None, remove characters in chars instead.\n\
5641If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005642
5643static PyObject *
5644unicode_strip(PyUnicodeObject *self, PyObject *args)
5645{
5646 if (PyTuple_GET_SIZE(args) == 0)
5647 return do_strip(self, BOTHSTRIP); /* Common case */
5648 else
5649 return do_argstrip(self, BOTHSTRIP, args);
5650}
5651
5652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005653PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005654"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005655\n\
5656Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005657If chars is given and not None, remove characters in chars instead.\n\
5658If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005659
5660static PyObject *
5661unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5662{
5663 if (PyTuple_GET_SIZE(args) == 0)
5664 return do_strip(self, LEFTSTRIP); /* Common case */
5665 else
5666 return do_argstrip(self, LEFTSTRIP, args);
5667}
5668
5669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005670PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005671"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005672\n\
5673Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005674If chars is given and not None, remove characters in chars instead.\n\
5675If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005676
5677static PyObject *
5678unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5679{
5680 if (PyTuple_GET_SIZE(args) == 0)
5681 return do_strip(self, RIGHTSTRIP); /* Common case */
5682 else
5683 return do_argstrip(self, RIGHTSTRIP, args);
5684}
5685
5686
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687static PyObject*
5688unicode_repeat(PyUnicodeObject *str, int len)
5689{
5690 PyUnicodeObject *u;
5691 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005692 int nchars;
5693 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694
5695 if (len < 0)
5696 len = 0;
5697
Tim Peters7a29bd52001-09-12 03:03:31 +00005698 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 /* no repeat, return original string */
5700 Py_INCREF(str);
5701 return (PyObject*) str;
5702 }
Tim Peters8f422462000-09-09 06:13:41 +00005703
5704 /* ensure # of chars needed doesn't overflow int and # of bytes
5705 * needed doesn't overflow size_t
5706 */
5707 nchars = len * str->length;
5708 if (len && nchars / len != str->length) {
5709 PyErr_SetString(PyExc_OverflowError,
5710 "repeated string is too long");
5711 return NULL;
5712 }
5713 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5714 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5715 PyErr_SetString(PyExc_OverflowError,
5716 "repeated string is too long");
5717 return NULL;
5718 }
5719 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 if (!u)
5721 return NULL;
5722
5723 p = u->str;
5724
5725 while (len-- > 0) {
5726 Py_UNICODE_COPY(p, str->str, str->length);
5727 p += str->length;
5728 }
5729
5730 return (PyObject*) u;
5731}
5732
5733PyObject *PyUnicode_Replace(PyObject *obj,
5734 PyObject *subobj,
5735 PyObject *replobj,
5736 int maxcount)
5737{
5738 PyObject *self;
5739 PyObject *str1;
5740 PyObject *str2;
5741 PyObject *result;
5742
5743 self = PyUnicode_FromObject(obj);
5744 if (self == NULL)
5745 return NULL;
5746 str1 = PyUnicode_FromObject(subobj);
5747 if (str1 == NULL) {
5748 Py_DECREF(self);
5749 return NULL;
5750 }
5751 str2 = PyUnicode_FromObject(replobj);
5752 if (str2 == NULL) {
5753 Py_DECREF(self);
5754 Py_DECREF(str1);
5755 return NULL;
5756 }
Tim Petersced69f82003-09-16 20:30:58 +00005757 result = replace((PyUnicodeObject *)self,
5758 (PyUnicodeObject *)str1,
5759 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 maxcount);
5761 Py_DECREF(self);
5762 Py_DECREF(str1);
5763 Py_DECREF(str2);
5764 return result;
5765}
5766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005767PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768"S.replace (old, new[, maxsplit]) -> unicode\n\
5769\n\
5770Return a copy of S with all occurrences of substring\n\
5771old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005772given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773
5774static PyObject*
5775unicode_replace(PyUnicodeObject *self, PyObject *args)
5776{
5777 PyUnicodeObject *str1;
5778 PyUnicodeObject *str2;
5779 int maxcount = -1;
5780 PyObject *result;
5781
5782 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5783 return NULL;
5784 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5785 if (str1 == NULL)
5786 return NULL;
5787 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005788 if (str2 == NULL) {
5789 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
5793 result = replace(self, str1, str2, maxcount);
5794
5795 Py_DECREF(str1);
5796 Py_DECREF(str2);
5797 return result;
5798}
5799
5800static
5801PyObject *unicode_repr(PyObject *unicode)
5802{
5803 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5804 PyUnicode_GET_SIZE(unicode),
5805 1);
5806}
5807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005808PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809"S.rfind(sub [,start [,end]]) -> int\n\
5810\n\
5811Return the highest index in S where substring sub is found,\n\
5812such that sub is contained within s[start,end]. Optional\n\
5813arguments start and end are interpreted as in slice notation.\n\
5814\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005815Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816
5817static PyObject *
5818unicode_rfind(PyUnicodeObject *self, PyObject *args)
5819{
5820 PyUnicodeObject *substring;
5821 int start = 0;
5822 int end = INT_MAX;
5823 PyObject *result;
5824
Guido van Rossumb8872e62000-05-09 14:14:27 +00005825 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5826 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 return NULL;
5828 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5829 (PyObject *)substring);
5830 if (substring == NULL)
5831 return NULL;
5832
5833 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5834
5835 Py_DECREF(substring);
5836 return result;
5837}
5838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005839PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840"S.rindex(sub [,start [,end]]) -> int\n\
5841\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005842Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843
5844static PyObject *
5845unicode_rindex(PyUnicodeObject *self, PyObject *args)
5846{
5847 int result;
5848 PyUnicodeObject *substring;
5849 int start = 0;
5850 int end = INT_MAX;
5851
Guido van Rossumb8872e62000-05-09 14:14:27 +00005852 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5853 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 return NULL;
5855 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5856 (PyObject *)substring);
5857 if (substring == NULL)
5858 return NULL;
5859
5860 result = findstring(self, substring, start, end, -1);
5861
5862 Py_DECREF(substring);
5863 if (result < 0) {
5864 PyErr_SetString(PyExc_ValueError, "substring not found");
5865 return NULL;
5866 }
5867 return PyInt_FromLong(result);
5868}
5869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005870PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005871"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872\n\
5873Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005874done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875
5876static PyObject *
5877unicode_rjust(PyUnicodeObject *self, PyObject *args)
5878{
5879 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005880 Py_UNICODE fillchar = ' ';
5881
5882 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 return NULL;
5884
Tim Peters7a29bd52001-09-12 03:03:31 +00005885 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 Py_INCREF(self);
5887 return (PyObject*) self;
5888 }
5889
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005890 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891}
5892
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893static PyObject*
5894unicode_slice(PyUnicodeObject *self, int start, int end)
5895{
5896 /* standard clamping */
5897 if (start < 0)
5898 start = 0;
5899 if (end < 0)
5900 end = 0;
5901 if (end > self->length)
5902 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005903 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 /* full slice, return original string */
5905 Py_INCREF(self);
5906 return (PyObject*) self;
5907 }
5908 if (start > end)
5909 start = end;
5910 /* copy slice */
5911 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5912 end - start);
5913}
5914
5915PyObject *PyUnicode_Split(PyObject *s,
5916 PyObject *sep,
5917 int maxsplit)
5918{
5919 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005920
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 s = PyUnicode_FromObject(s);
5922 if (s == NULL)
5923 return NULL;
5924 if (sep != NULL) {
5925 sep = PyUnicode_FromObject(sep);
5926 if (sep == NULL) {
5927 Py_DECREF(s);
5928 return NULL;
5929 }
5930 }
5931
5932 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5933
5934 Py_DECREF(s);
5935 Py_XDECREF(sep);
5936 return result;
5937}
5938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005939PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940"S.split([sep [,maxsplit]]) -> list of strings\n\
5941\n\
5942Return a list of the words in S, using sep as the\n\
5943delimiter string. If maxsplit is given, at most maxsplit\n\
5944splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005945is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946
5947static PyObject*
5948unicode_split(PyUnicodeObject *self, PyObject *args)
5949{
5950 PyObject *substring = Py_None;
5951 int maxcount = -1;
5952
5953 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5954 return NULL;
5955
5956 if (substring == Py_None)
5957 return split(self, NULL, maxcount);
5958 else if (PyUnicode_Check(substring))
5959 return split(self, (PyUnicodeObject *)substring, maxcount);
5960 else
5961 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5962}
5963
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005964PyObject *PyUnicode_RSplit(PyObject *s,
5965 PyObject *sep,
5966 int maxsplit)
5967{
5968 PyObject *result;
5969
5970 s = PyUnicode_FromObject(s);
5971 if (s == NULL)
5972 return NULL;
5973 if (sep != NULL) {
5974 sep = PyUnicode_FromObject(sep);
5975 if (sep == NULL) {
5976 Py_DECREF(s);
5977 return NULL;
5978 }
5979 }
5980
5981 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5982
5983 Py_DECREF(s);
5984 Py_XDECREF(sep);
5985 return result;
5986}
5987
5988PyDoc_STRVAR(rsplit__doc__,
5989"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
5990\n\
5991Return a list of the words in S, using sep as the\n\
5992delimiter string, starting at the end of the string and\n\
5993working to the front. If maxsplit is given, at most maxsplit\n\
5994splits are done. If sep is not specified, any whitespace string\n\
5995is a separator.");
5996
5997static PyObject*
5998unicode_rsplit(PyUnicodeObject *self, PyObject *args)
5999{
6000 PyObject *substring = Py_None;
6001 int maxcount = -1;
6002
6003 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6004 return NULL;
6005
6006 if (substring == Py_None)
6007 return rsplit(self, NULL, maxcount);
6008 else if (PyUnicode_Check(substring))
6009 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6010 else
6011 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6012}
6013
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006014PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006015"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016\n\
6017Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006018Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006019is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
6021static PyObject*
6022unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6023{
Guido van Rossum86662912000-04-11 15:38:46 +00006024 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
Guido van Rossum86662912000-04-11 15:38:46 +00006026 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 return NULL;
6028
Guido van Rossum86662912000-04-11 15:38:46 +00006029 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030}
6031
6032static
6033PyObject *unicode_str(PyUnicodeObject *self)
6034{
Fred Drakee4315f52000-05-09 19:53:39 +00006035 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036}
6037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006038PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039"S.swapcase() -> unicode\n\
6040\n\
6041Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006042and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043
6044static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006045unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 return fixup(self, fixswapcase);
6048}
6049
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006050PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051"S.translate(table) -> unicode\n\
6052\n\
6053Return a copy of the string S, where all characters have been mapped\n\
6054through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006055Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6056Unmapped characters are left untouched. Characters mapped to None\n\
6057are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058
6059static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006060unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061{
Tim Petersced69f82003-09-16 20:30:58 +00006062 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006064 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 "ignore");
6066}
6067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006068PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069"S.upper() -> unicode\n\
6070\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006071Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072
6073static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006074unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 return fixup(self, fixupper);
6077}
6078
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006079PyDoc_STRVAR(width__doc__,
6080"S.width() -> unicode\n\
6081\n\
6082Return a fixed-width representation length of S.");
6083
6084static PyObject*
6085unicode_width(PyObject *self)
6086{
6087 int width = PyUnicode_GetWidth(self);
6088 if (width == -1)
6089 return NULL;
6090 else
6091 return PyInt_FromLong((long)width);
6092}
6093
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006094PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095"S.zfill(width) -> unicode\n\
6096\n\
6097Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006098of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099
6100static PyObject *
6101unicode_zfill(PyUnicodeObject *self, PyObject *args)
6102{
6103 int fill;
6104 PyUnicodeObject *u;
6105
6106 int width;
6107 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6108 return NULL;
6109
6110 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006111 if (PyUnicode_CheckExact(self)) {
6112 Py_INCREF(self);
6113 return (PyObject*) self;
6114 }
6115 else
6116 return PyUnicode_FromUnicode(
6117 PyUnicode_AS_UNICODE(self),
6118 PyUnicode_GET_SIZE(self)
6119 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 }
6121
6122 fill = width - self->length;
6123
6124 u = pad(self, fill, 0, '0');
6125
Walter Dörwald068325e2002-04-15 13:36:47 +00006126 if (u == NULL)
6127 return NULL;
6128
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 if (u->str[fill] == '+' || u->str[fill] == '-') {
6130 /* move sign to beginning of string */
6131 u->str[0] = u->str[fill];
6132 u->str[fill] = '0';
6133 }
6134
6135 return (PyObject*) u;
6136}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137
#if 0
/* Compiled out.  Returns unicode_freelist_size wrapped in a Python
   int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
6145
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006146PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006147"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006149Return True if S starts with the specified prefix, False otherwise.\n\
6150With optional start, test S beginning at that position.\n\
6151With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152
6153static PyObject *
6154unicode_startswith(PyUnicodeObject *self,
6155 PyObject *args)
6156{
6157 PyUnicodeObject *substring;
6158 int start = 0;
6159 int end = INT_MAX;
6160 PyObject *result;
6161
Guido van Rossumb8872e62000-05-09 14:14:27 +00006162 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6163 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 return NULL;
6165 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6166 (PyObject *)substring);
6167 if (substring == NULL)
6168 return NULL;
6169
Guido van Rossum77f6a652002-04-03 22:41:51 +00006170 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171
6172 Py_DECREF(substring);
6173 return result;
6174}
6175
6176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006177PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006178"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006180Return True if S ends with the specified suffix, False otherwise.\n\
6181With optional start, test S beginning at that position.\n\
6182With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183
6184static PyObject *
6185unicode_endswith(PyUnicodeObject *self,
6186 PyObject *args)
6187{
6188 PyUnicodeObject *substring;
6189 int start = 0;
6190 int end = INT_MAX;
6191 PyObject *result;
6192
Guido van Rossumb8872e62000-05-09 14:14:27 +00006193 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6194 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 return NULL;
6196 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6197 (PyObject *)substring);
6198 if (substring == NULL)
6199 return NULL;
6200
Guido van Rossum77f6a652002-04-03 22:41:51 +00006201 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202
6203 Py_DECREF(substring);
6204 return result;
6205}
6206
6207
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006208
6209static PyObject *
6210unicode_getnewargs(PyUnicodeObject *v)
6211{
6212 return Py_BuildValue("(u#)", v->str, v->length);
6213}
6214
6215
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216static PyMethodDef unicode_methods[] = {
6217
6218 /* Order is according to common usage: often used methods should
6219 appear first, since lookup is done sequentially. */
6220
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006221 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6222 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6223 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006224 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006225 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6226 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6227 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6228 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6229 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6230 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6231 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6232 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6233 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6234 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006235 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006236 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006237/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6238 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6239 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6240 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006241 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006242 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006243 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006244 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6245 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6246 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6247 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6248 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6249 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6250 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6251 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6252 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6253 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6254 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6255 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6256 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6257 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006258 {"iswide", (PyCFunction) unicode_iswide, METH_NOARGS, iswide__doc__},
6259 {"width", (PyCFunction) unicode_width, METH_NOARGS, width__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006260 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006261#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006262 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263#endif
6264
6265#if 0
6266 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006267 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268#endif
6269
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006270 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 {NULL, NULL}
6272};
6273
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006274static PyObject *
6275unicode_mod(PyObject *v, PyObject *w)
6276{
6277 if (!PyUnicode_Check(v)) {
6278 Py_INCREF(Py_NotImplemented);
6279 return Py_NotImplemented;
6280 }
6281 return PyUnicode_Format(v, w);
6282}
6283
6284static PyNumberMethods unicode_as_number = {
6285 0, /*nb_add*/
6286 0, /*nb_subtract*/
6287 0, /*nb_multiply*/
6288 0, /*nb_divide*/
6289 unicode_mod, /*nb_remainder*/
6290};
6291
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292static PySequenceMethods unicode_as_sequence = {
6293 (inquiry) unicode_length, /* sq_length */
6294 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6295 (intargfunc) unicode_repeat, /* sq_repeat */
6296 (intargfunc) unicode_getitem, /* sq_item */
6297 (intintargfunc) unicode_slice, /* sq_slice */
6298 0, /* sq_ass_item */
6299 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006300 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301};
6302
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006303static PyObject*
6304unicode_subscript(PyUnicodeObject* self, PyObject* item)
6305{
6306 if (PyInt_Check(item)) {
6307 long i = PyInt_AS_LONG(item);
6308 if (i < 0)
6309 i += PyString_GET_SIZE(self);
6310 return unicode_getitem(self, i);
6311 } else if (PyLong_Check(item)) {
6312 long i = PyLong_AsLong(item);
6313 if (i == -1 && PyErr_Occurred())
6314 return NULL;
6315 if (i < 0)
6316 i += PyString_GET_SIZE(self);
6317 return unicode_getitem(self, i);
6318 } else if (PySlice_Check(item)) {
6319 int start, stop, step, slicelength, cur, i;
6320 Py_UNICODE* source_buf;
6321 Py_UNICODE* result_buf;
6322 PyObject* result;
6323
6324 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6325 &start, &stop, &step, &slicelength) < 0) {
6326 return NULL;
6327 }
6328
6329 if (slicelength <= 0) {
6330 return PyUnicode_FromUnicode(NULL, 0);
6331 } else {
6332 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6333 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6334
6335 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6336 result_buf[i] = source_buf[cur];
6337 }
Tim Petersced69f82003-09-16 20:30:58 +00006338
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006339 result = PyUnicode_FromUnicode(result_buf, slicelength);
6340 PyMem_FREE(result_buf);
6341 return result;
6342 }
6343 } else {
6344 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6345 return NULL;
6346 }
6347}
6348
6349static PyMappingMethods unicode_as_mapping = {
6350 (inquiry)unicode_length, /* mp_length */
6351 (binaryfunc)unicode_subscript, /* mp_subscript */
6352 (objobjargproc)0, /* mp_ass_subscript */
6353};
6354
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355static int
6356unicode_buffer_getreadbuf(PyUnicodeObject *self,
6357 int index,
6358 const void **ptr)
6359{
6360 if (index != 0) {
6361 PyErr_SetString(PyExc_SystemError,
6362 "accessing non-existent unicode segment");
6363 return -1;
6364 }
6365 *ptr = (void *) self->str;
6366 return PyUnicode_GET_DATA_SIZE(self);
6367}
6368
6369static int
6370unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6371 const void **ptr)
6372{
6373 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006374 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 return -1;
6376}
6377
6378static int
6379unicode_buffer_getsegcount(PyUnicodeObject *self,
6380 int *lenp)
6381{
6382 if (lenp)
6383 *lenp = PyUnicode_GET_DATA_SIZE(self);
6384 return 1;
6385}
6386
6387static int
6388unicode_buffer_getcharbuf(PyUnicodeObject *self,
6389 int index,
6390 const void **ptr)
6391{
6392 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006393
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 if (index != 0) {
6395 PyErr_SetString(PyExc_SystemError,
6396 "accessing non-existent unicode segment");
6397 return -1;
6398 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006399 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 if (str == NULL)
6401 return -1;
6402 *ptr = (void *) PyString_AS_STRING(str);
6403 return PyString_GET_SIZE(str);
6404}
6405
6406/* Helpers for PyUnicode_Format() */
6407
6408static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006409getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410{
6411 int argidx = *p_argidx;
6412 if (argidx < arglen) {
6413 (*p_argidx)++;
6414 if (arglen < 0)
6415 return args;
6416 else
6417 return PyTuple_GetItem(args, argidx);
6418 }
6419 PyErr_SetString(PyExc_TypeError,
6420 "not enough arguments for format string");
6421 return NULL;
6422}
6423
/* Bit flags for the `flags` argument of the formatting helpers.
   F_ALT is the one tested by formatfloat() and formatint(), where it
   selects the '#' modifier in the generated C format string.  The other
   flags presumably mirror the remaining printf-style flag characters --
   confirm against PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
6429
6430static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432{
6433 register int i;
6434 int len;
6435 va_list va;
6436 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438
6439 /* First, format the string as char array, then expand to Py_UNICODE
6440 array. */
6441 charbuffer = (char *)buffer;
6442 len = vsprintf(charbuffer, format, va);
6443 for (i = len - 1; i >= 0; i--)
6444 buffer[i] = (Py_UNICODE) charbuffer[i];
6445
6446 va_end(va);
6447 return len;
6448}
6449
Guido van Rossum078151d2002-08-11 04:24:12 +00006450/* XXX To save some code duplication, formatfloat/long/int could have been
6451 shared with stringobject.c, converting from 8-bit to Unicode after the
6452 formatting is done. */
6453
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454static int
6455formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006456 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 int flags,
6458 int prec,
6459 int type,
6460 PyObject *v)
6461{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006462 /* fmt = '%#.' + `prec` + `type`
6463 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 char fmt[20];
6465 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006466
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 x = PyFloat_AsDouble(v);
6468 if (x == -1.0 && PyErr_Occurred())
6469 return -1;
6470 if (prec < 0)
6471 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6473 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006474 /* Worst case length calc to ensure no buffer overrun:
6475
6476 'g' formats:
6477 fmt = %#.<prec>g
6478 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6479 for any double rep.)
6480 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6481
6482 'f' formats:
6483 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6484 len = 1 + 50 + 1 + prec = 52 + prec
6485
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006486 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006487 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006488
6489 */
6490 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6491 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006492 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006493 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006494 return -1;
6495 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006496 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6497 (flags&F_ALT) ? "#" : "",
6498 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 return usprintf(buf, fmt, x);
6500}
6501
Tim Peters38fd5b62000-09-21 05:43:11 +00006502static PyObject*
6503formatlong(PyObject *val, int flags, int prec, int type)
6504{
6505 char *buf;
6506 int i, len;
6507 PyObject *str; /* temporary string object. */
6508 PyUnicodeObject *result;
6509
6510 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6511 if (!str)
6512 return NULL;
6513 result = _PyUnicode_New(len);
6514 for (i = 0; i < len; i++)
6515 result->str[i] = buf[i];
6516 result->str[len] = 0;
6517 Py_DECREF(str);
6518 return (PyObject*)result;
6519}
6520
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521static int
6522formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006523 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524 int flags,
6525 int prec,
6526 int type,
6527 PyObject *v)
6528{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006529 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006530 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6531 * + 1 + 1
6532 * = 24
6533 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006534 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006535 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 long x;
6537
6538 x = PyInt_AsLong(v);
6539 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006540 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006541 if (x < 0 && type == 'u') {
6542 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006543 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006544 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6545 sign = "-";
6546 else
6547 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006549 prec = 1;
6550
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006551 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6552 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006553 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006554 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006555 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006556 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006557 return -1;
6558 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006559
6560 if ((flags & F_ALT) &&
6561 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006562 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006563 * of issues that cause pain:
6564 * - when 0 is being converted, the C standard leaves off
6565 * the '0x' or '0X', which is inconsistent with other
6566 * %#x/%#X conversions and inconsistent with Python's
6567 * hex() function
6568 * - there are platforms that violate the standard and
6569 * convert 0 with the '0x' or '0X'
6570 * (Metrowerks, Compaq Tru64)
6571 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006572 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006573 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006574 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006575 * We can achieve the desired consistency by inserting our
6576 * own '0x' or '0X' prefix, and substituting %x/%X in place
6577 * of %#x/%#X.
6578 *
6579 * Note that this is the same approach as used in
6580 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006581 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006582 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6583 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006584 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006585 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006586 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6587 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006588 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006589 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006590 if (sign[0])
6591 return usprintf(buf, fmt, -x);
6592 else
6593 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594}
6595
6596static int
6597formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006598 size_t buflen,
6599 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006601 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006602 if (PyUnicode_Check(v)) {
6603 if (PyUnicode_GET_SIZE(v) != 1)
6604 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006606 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006608 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006609 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006610 goto onError;
6611 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613
6614 else {
6615 /* Integer input truncated to a character */
6616 long x;
6617 x = PyInt_AsLong(v);
6618 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006619 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006620#ifdef Py_UNICODE_WIDE
6621 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006622 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006623 "%c arg not in range(0x110000) "
6624 "(wide Python build)");
6625 return -1;
6626 }
6627#else
6628 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006629 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006630 "%c arg not in range(0x10000) "
6631 "(narrow Python build)");
6632 return -1;
6633 }
6634#endif
6635 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 }
6637 buf[1] = '\0';
6638 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006639
6640 onError:
6641 PyErr_SetString(PyExc_TypeError,
6642 "%c requires int or char");
6643 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644}
6645
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in Py_UNICODE elements) of the scratch
   buffer in which floats, ints and chars are formatted.  XXX This is a
   magic number.  Each formatting routine does bounds checking to ensure
   no overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single primary expression wherever it is used (e.g. as the operand
   of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
6655
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656PyObject *PyUnicode_Format(PyObject *format,
6657 PyObject *args)
6658{
6659 Py_UNICODE *fmt, *res;
6660 int fmtcnt, rescnt, reslen, arglen, argidx;
6661 int args_owned = 0;
6662 PyUnicodeObject *result = NULL;
6663 PyObject *dict = NULL;
6664 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006665
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 if (format == NULL || args == NULL) {
6667 PyErr_BadInternalCall();
6668 return NULL;
6669 }
6670 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006671 if (uformat == NULL)
6672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673 fmt = PyUnicode_AS_UNICODE(uformat);
6674 fmtcnt = PyUnicode_GET_SIZE(uformat);
6675
6676 reslen = rescnt = fmtcnt + 100;
6677 result = _PyUnicode_New(reslen);
6678 if (result == NULL)
6679 goto onError;
6680 res = PyUnicode_AS_UNICODE(result);
6681
6682 if (PyTuple_Check(args)) {
6683 arglen = PyTuple_Size(args);
6684 argidx = 0;
6685 }
6686 else {
6687 arglen = -1;
6688 argidx = -2;
6689 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006690 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6691 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692 dict = args;
6693
6694 while (--fmtcnt >= 0) {
6695 if (*fmt != '%') {
6696 if (--rescnt < 0) {
6697 rescnt = fmtcnt + 100;
6698 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006699 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 return NULL;
6701 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6702 --rescnt;
6703 }
6704 *res++ = *fmt++;
6705 }
6706 else {
6707 /* Got a format specifier */
6708 int flags = 0;
6709 int width = -1;
6710 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 Py_UNICODE c = '\0';
6712 Py_UNICODE fill;
6713 PyObject *v = NULL;
6714 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006715 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 Py_UNICODE sign;
6717 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006718 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719
6720 fmt++;
6721 if (*fmt == '(') {
6722 Py_UNICODE *keystart;
6723 int keylen;
6724 PyObject *key;
6725 int pcount = 1;
6726
6727 if (dict == NULL) {
6728 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006729 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 goto onError;
6731 }
6732 ++fmt;
6733 --fmtcnt;
6734 keystart = fmt;
6735 /* Skip over balanced parentheses */
6736 while (pcount > 0 && --fmtcnt >= 0) {
6737 if (*fmt == ')')
6738 --pcount;
6739 else if (*fmt == '(')
6740 ++pcount;
6741 fmt++;
6742 }
6743 keylen = fmt - keystart - 1;
6744 if (fmtcnt < 0 || pcount > 0) {
6745 PyErr_SetString(PyExc_ValueError,
6746 "incomplete format key");
6747 goto onError;
6748 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006749#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006750 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 then looked up since Python uses strings to hold
6752 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006753 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 key = PyUnicode_EncodeUTF8(keystart,
6755 keylen,
6756 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006757#else
6758 key = PyUnicode_FromUnicode(keystart, keylen);
6759#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 if (key == NULL)
6761 goto onError;
6762 if (args_owned) {
6763 Py_DECREF(args);
6764 args_owned = 0;
6765 }
6766 args = PyObject_GetItem(dict, key);
6767 Py_DECREF(key);
6768 if (args == NULL) {
6769 goto onError;
6770 }
6771 args_owned = 1;
6772 arglen = -1;
6773 argidx = -2;
6774 }
6775 while (--fmtcnt >= 0) {
6776 switch (c = *fmt++) {
6777 case '-': flags |= F_LJUST; continue;
6778 case '+': flags |= F_SIGN; continue;
6779 case ' ': flags |= F_BLANK; continue;
6780 case '#': flags |= F_ALT; continue;
6781 case '0': flags |= F_ZERO; continue;
6782 }
6783 break;
6784 }
6785 if (c == '*') {
6786 v = getnextarg(args, arglen, &argidx);
6787 if (v == NULL)
6788 goto onError;
6789 if (!PyInt_Check(v)) {
6790 PyErr_SetString(PyExc_TypeError,
6791 "* wants int");
6792 goto onError;
6793 }
6794 width = PyInt_AsLong(v);
6795 if (width < 0) {
6796 flags |= F_LJUST;
6797 width = -width;
6798 }
6799 if (--fmtcnt >= 0)
6800 c = *fmt++;
6801 }
6802 else if (c >= '0' && c <= '9') {
6803 width = c - '0';
6804 while (--fmtcnt >= 0) {
6805 c = *fmt++;
6806 if (c < '0' || c > '9')
6807 break;
6808 if ((width*10) / 10 != width) {
6809 PyErr_SetString(PyExc_ValueError,
6810 "width too big");
6811 goto onError;
6812 }
6813 width = width*10 + (c - '0');
6814 }
6815 }
6816 if (c == '.') {
6817 prec = 0;
6818 if (--fmtcnt >= 0)
6819 c = *fmt++;
6820 if (c == '*') {
6821 v = getnextarg(args, arglen, &argidx);
6822 if (v == NULL)
6823 goto onError;
6824 if (!PyInt_Check(v)) {
6825 PyErr_SetString(PyExc_TypeError,
6826 "* wants int");
6827 goto onError;
6828 }
6829 prec = PyInt_AsLong(v);
6830 if (prec < 0)
6831 prec = 0;
6832 if (--fmtcnt >= 0)
6833 c = *fmt++;
6834 }
6835 else if (c >= '0' && c <= '9') {
6836 prec = c - '0';
6837 while (--fmtcnt >= 0) {
6838 c = Py_CHARMASK(*fmt++);
6839 if (c < '0' || c > '9')
6840 break;
6841 if ((prec*10) / 10 != prec) {
6842 PyErr_SetString(PyExc_ValueError,
6843 "prec too big");
6844 goto onError;
6845 }
6846 prec = prec*10 + (c - '0');
6847 }
6848 }
6849 } /* prec */
6850 if (fmtcnt >= 0) {
6851 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 if (--fmtcnt >= 0)
6853 c = *fmt++;
6854 }
6855 }
6856 if (fmtcnt < 0) {
6857 PyErr_SetString(PyExc_ValueError,
6858 "incomplete format");
6859 goto onError;
6860 }
6861 if (c != '%') {
6862 v = getnextarg(args, arglen, &argidx);
6863 if (v == NULL)
6864 goto onError;
6865 }
6866 sign = 0;
6867 fill = ' ';
6868 switch (c) {
6869
6870 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006871 pbuf = formatbuf;
6872 /* presume that buffer length is at least 1 */
6873 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874 len = 1;
6875 break;
6876
6877 case 's':
6878 case 'r':
6879 if (PyUnicode_Check(v) && c == 's') {
6880 temp = v;
6881 Py_INCREF(temp);
6882 }
6883 else {
6884 PyObject *unicode;
6885 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006886 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 else
6888 temp = PyObject_Repr(v);
6889 if (temp == NULL)
6890 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006891 if (PyUnicode_Check(temp))
6892 /* nothing to do */;
6893 else if (PyString_Check(temp)) {
6894 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00006895 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006897 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 "strict");
6899 Py_DECREF(temp);
6900 temp = unicode;
6901 if (temp == NULL)
6902 goto onError;
6903 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006904 else {
6905 Py_DECREF(temp);
6906 PyErr_SetString(PyExc_TypeError,
6907 "%s argument has non-string str()");
6908 goto onError;
6909 }
6910 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006911 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 len = PyUnicode_GET_SIZE(temp);
6913 if (prec >= 0 && len > prec)
6914 len = prec;
6915 break;
6916
6917 case 'i':
6918 case 'd':
6919 case 'u':
6920 case 'o':
6921 case 'x':
6922 case 'X':
6923 if (c == 'i')
6924 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006925 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006926 temp = formatlong(v, flags, prec, c);
6927 if (!temp)
6928 goto onError;
6929 pbuf = PyUnicode_AS_UNICODE(temp);
6930 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00006931 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006933 else {
6934 pbuf = formatbuf;
6935 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6936 flags, prec, c, v);
6937 if (len < 0)
6938 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006939 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006940 }
6941 if (flags & F_ZERO)
6942 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 break;
6944
6945 case 'e':
6946 case 'E':
6947 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006948 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 case 'g':
6950 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006951 if (c == 'F')
6952 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006953 pbuf = formatbuf;
6954 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6955 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 if (len < 0)
6957 goto onError;
6958 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006959 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 fill = '0';
6961 break;
6962
6963 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006964 pbuf = formatbuf;
6965 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 if (len < 0)
6967 goto onError;
6968 break;
6969
6970 default:
6971 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006972 "unsupported format character '%c' (0x%x) "
6973 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006974 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006975 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006976 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 goto onError;
6978 }
6979 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006980 if (*pbuf == '-' || *pbuf == '+') {
6981 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 len--;
6983 }
6984 else if (flags & F_SIGN)
6985 sign = '+';
6986 else if (flags & F_BLANK)
6987 sign = ' ';
6988 else
6989 sign = 0;
6990 }
6991 if (width < len)
6992 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006993 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994 reslen -= rescnt;
6995 rescnt = width + fmtcnt + 100;
6996 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006997 if (reslen < 0) {
6998 Py_DECREF(result);
6999 return PyErr_NoMemory();
7000 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007001 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 return NULL;
7003 res = PyUnicode_AS_UNICODE(result)
7004 + reslen - rescnt;
7005 }
7006 if (sign) {
7007 if (fill != ' ')
7008 *res++ = sign;
7009 rescnt--;
7010 if (width > len)
7011 width--;
7012 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007013 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7014 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007015 assert(pbuf[1] == c);
7016 if (fill != ' ') {
7017 *res++ = *pbuf++;
7018 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007019 }
Tim Petersfff53252001-04-12 18:38:48 +00007020 rescnt -= 2;
7021 width -= 2;
7022 if (width < 0)
7023 width = 0;
7024 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007025 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 if (width > len && !(flags & F_LJUST)) {
7027 do {
7028 --rescnt;
7029 *res++ = fill;
7030 } while (--width > len);
7031 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007032 if (fill == ' ') {
7033 if (sign)
7034 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007035 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007036 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007037 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007038 *res++ = *pbuf++;
7039 *res++ = *pbuf++;
7040 }
7041 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007042 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 res += len;
7044 rescnt -= len;
7045 while (--width >= len) {
7046 --rescnt;
7047 *res++ = ' ';
7048 }
7049 if (dict && (argidx < arglen) && c != '%') {
7050 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007051 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052 goto onError;
7053 }
7054 Py_XDECREF(temp);
7055 } /* '%' */
7056 } /* until end */
7057 if (argidx < arglen && !dict) {
7058 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007059 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060 goto onError;
7061 }
7062
7063 if (args_owned) {
7064 Py_DECREF(args);
7065 }
7066 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007067 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007068 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069 return (PyObject *)result;
7070
7071 onError:
7072 Py_XDECREF(result);
7073 Py_DECREF(uformat);
7074 if (args_owned) {
7075 Py_DECREF(args);
7076 }
7077 return NULL;
7078}
7079
7080static PyBufferProcs unicode_as_buffer = {
7081 (getreadbufferproc) unicode_buffer_getreadbuf,
7082 (getwritebufferproc) unicode_buffer_getwritebuf,
7083 (getsegcountproc) unicode_buffer_getsegcount,
7084 (getcharbufferproc) unicode_buffer_getcharbuf,
7085};
7086
Jeremy Hylton938ace62002-07-17 16:30:39 +00007087static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007088unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7089
Tim Peters6d6c1a32001-08-02 04:15:00 +00007090static PyObject *
7091unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7092{
7093 PyObject *x = NULL;
7094 static char *kwlist[] = {"string", "encoding", "errors", 0};
7095 char *encoding = NULL;
7096 char *errors = NULL;
7097
Guido van Rossume023fe02001-08-30 03:12:59 +00007098 if (type != &PyUnicode_Type)
7099 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007100 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7101 kwlist, &x, &encoding, &errors))
7102 return NULL;
7103 if (x == NULL)
7104 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007105 if (encoding == NULL && errors == NULL)
7106 return PyObject_Unicode(x);
7107 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007108 return PyUnicode_FromEncodedObject(x, encoding, errors);
7109}
7110
Guido van Rossume023fe02001-08-30 03:12:59 +00007111static PyObject *
7112unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7113{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007114 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007115 int n;
7116
7117 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7118 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7119 if (tmp == NULL)
7120 return NULL;
7121 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007122 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007123 if (pnew == NULL) {
7124 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007125 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007126 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007127 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7128 if (pnew->str == NULL) {
7129 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007130 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007131 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007132 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007133 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007134 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7135 pnew->length = n;
7136 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007137 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007138 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007139}
7140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007141PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007142"unicode(string [, encoding[, errors]]) -> object\n\
7143\n\
7144Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007145encoding defaults to the current default string encoding.\n\
7146errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007147
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148PyTypeObject PyUnicode_Type = {
7149 PyObject_HEAD_INIT(&PyType_Type)
7150 0, /* ob_size */
7151 "unicode", /* tp_name */
7152 sizeof(PyUnicodeObject), /* tp_size */
7153 0, /* tp_itemsize */
7154 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007155 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007157 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158 0, /* tp_setattr */
7159 (cmpfunc) unicode_compare, /* tp_compare */
7160 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007161 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007163 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 (hashfunc) unicode_hash, /* tp_hash*/
7165 0, /* tp_call*/
7166 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007167 PyObject_GenericGetAttr, /* tp_getattro */
7168 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007170 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7171 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007172 unicode_doc, /* tp_doc */
7173 0, /* tp_traverse */
7174 0, /* tp_clear */
7175 0, /* tp_richcompare */
7176 0, /* tp_weaklistoffset */
7177 0, /* tp_iter */
7178 0, /* tp_iternext */
7179 unicode_methods, /* tp_methods */
7180 0, /* tp_members */
7181 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007182 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007183 0, /* tp_dict */
7184 0, /* tp_descr_get */
7185 0, /* tp_descr_set */
7186 0, /* tp_dictoffset */
7187 0, /* tp_init */
7188 0, /* tp_alloc */
7189 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007190 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191};
7192
7193/* Initialize the Unicode implementation */
7194
Thomas Wouters78890102000-07-22 19:25:51 +00007195void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007197 int i;
7198
Fred Drakee4315f52000-05-09 19:53:39 +00007199 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007200 unicode_freelist = NULL;
7201 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007203 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007204 for (i = 0; i < 256; i++)
7205 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007206 if (PyType_Ready(&PyUnicode_Type) < 0)
7207 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208}
7209
7210/* Finalize the Unicode implementation */
7211
7212void
Thomas Wouters78890102000-07-22 19:25:51 +00007213_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007215 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007216 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007218 Py_XDECREF(unicode_empty);
7219 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007220
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007221 for (i = 0; i < 256; i++) {
7222 if (unicode_latin1[i]) {
7223 Py_DECREF(unicode_latin1[i]);
7224 unicode_latin1[i] = NULL;
7225 }
7226 }
7227
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007228 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 PyUnicodeObject *v = u;
7230 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007231 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007232 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007233 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007234 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007236 unicode_freelist = NULL;
7237 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007239
7240/*
7241Local variables:
7242c-basic-offset: 4
7243indent-tabs-mode: nil
7244End:
7245*/