blob: 668d239f353603a30216dc3456d7219d3eb4e5cf [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects parked on the free list. */
#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects parked on the free list.

   A freed object whose length is below this limit keeps its character
   buffer while it sits on the free list, which saves malloc() overhead
   for small Unicode objects.

   At worst this results in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: this is an experimental feature.  If you get core dumps when
   using Unicode objects, turn it off.
*/
#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
/* Byte order switches: big endian only when the build defines
   WORDS_BIGENDIAN, little endian in every other case. */
#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper that casts the address of a Unicode object
   variable to PyObject ** before calling PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
371 for (i = size; i >= 0; i--)
372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
387 if (size > PyUnicode_GET_SIZE(unicode))
388 size = PyUnicode_GET_SIZE(unicode);
389#ifdef HAVE_USABLE_WCHAR_T
390 memcpy(w, unicode->str, size * sizeof(wchar_t));
391#else
392 {
393 register Py_UNICODE *u;
394 register int i;
395 u = PyUnicode_AS_UNICODE(unicode);
396 for (i = size; i >= 0; i--)
397 *w++ = *u++;
398 }
399#endif
400
401 return size;
402}
403
404#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000408 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
Hye-Shik Chang40574832004-04-06 07:24:51 +0000426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000428}
429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromObject(register PyObject *obj)
431{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000432 /* XXX Perhaps we should make this API an alias of
433 PyObject_Unicode() instead ?! */
434 if (PyUnicode_CheckExact(obj)) {
435 Py_INCREF(obj);
436 return obj;
437 }
438 if (PyUnicode_Check(obj)) {
439 /* For a Unicode subtype that's not a Unicode object,
440 return a true Unicode object with the same data. */
441 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
442 PyUnicode_GET_SIZE(obj));
443 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000444 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
445}
446
447PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
448 const char *encoding,
449 const char *errors)
450{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000451 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000453 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 if (obj == NULL) {
456 PyErr_BadInternalCall();
457 return NULL;
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460#if 0
461 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000462 that no encodings is given and then redirect to
463 PyObject_Unicode() which then applies the additional logic for
464 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466 NOTE: This API should really only be used for object which
467 represent *encoded* Unicode !
468
469 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000470 if (PyUnicode_Check(obj)) {
471 if (encoding) {
472 PyErr_SetString(PyExc_TypeError,
473 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000474 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000475 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000476 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000477 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000478#else
479 if (PyUnicode_Check(obj)) {
480 PyErr_SetString(PyExc_TypeError,
481 "decoding Unicode is not supported");
482 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#endif
485
486 /* Coerce object */
487 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000488 s = PyString_AS_STRING(obj);
489 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
492 /* Overwrite the error message with something more useful in
493 case of a TypeError. */
494 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000495 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000496 "coercing to Unicode: need string or buffer, "
497 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 obj->ob_type->tp_name);
499 goto onError;
500 }
Tim Petersced69f82003-09-16 20:30:58 +0000501
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000502 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 if (len == 0) {
504 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000509
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 return v;
511
512 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514}
515
516PyObject *PyUnicode_Decode(const char *s,
517 int size,
518 const char *encoding,
519 const char *errors)
520{
521 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000522
523 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000524 encoding = PyUnicode_GetDefaultEncoding();
525
526 /* Shortcuts for common default encodings */
527 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000529 else if (strcmp(encoding, "latin-1") == 0)
530 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
532 else if (strcmp(encoding, "mbcs") == 0)
533 return PyUnicode_DecodeMBCS(s, size, errors);
534#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "ascii") == 0)
536 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537
538 /* Decode via the codec registry */
539 buffer = PyBuffer_FromMemory((void *)s, size);
540 if (buffer == NULL)
541 goto onError;
542 unicode = PyCodec_Decode(buffer, encoding, errors);
543 if (unicode == NULL)
544 goto onError;
545 if (!PyUnicode_Check(unicode)) {
546 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000547 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548 unicode->ob_type->tp_name);
549 Py_DECREF(unicode);
550 goto onError;
551 }
552 Py_DECREF(buffer);
553 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000554
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555 onError:
556 Py_XDECREF(buffer);
557 return NULL;
558}
559
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000560PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
561 const char *encoding,
562 const char *errors)
563{
564 PyObject *v;
565
566 if (!PyUnicode_Check(unicode)) {
567 PyErr_BadArgument();
568 goto onError;
569 }
570
571 if (encoding == NULL)
572 encoding = PyUnicode_GetDefaultEncoding();
573
574 /* Decode via the codec registry */
575 v = PyCodec_Decode(unicode, encoding, errors);
576 if (v == NULL)
577 goto onError;
578 return v;
579
580 onError:
581 return NULL;
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584PyObject *PyUnicode_Encode(const Py_UNICODE *s,
585 int size,
586 const char *encoding,
587 const char *errors)
588{
589 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000590
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 unicode = PyUnicode_FromUnicode(s, size);
592 if (unicode == NULL)
593 return NULL;
594 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
595 Py_DECREF(unicode);
596 return v;
597}
598
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000599PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
600 const char *encoding,
601 const char *errors)
602{
603 PyObject *v;
604
605 if (!PyUnicode_Check(unicode)) {
606 PyErr_BadArgument();
607 goto onError;
608 }
609
610 if (encoding == NULL)
611 encoding = PyUnicode_GetDefaultEncoding();
612
613 /* Encode via the codec registry */
614 v = PyCodec_Encode(unicode, encoding, errors);
615 if (v == NULL)
616 goto onError;
617 return v;
618
619 onError:
620 return NULL;
621}
622
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
624 const char *encoding,
625 const char *errors)
626{
627 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 if (!PyUnicode_Check(unicode)) {
630 PyErr_BadArgument();
631 goto onError;
632 }
Fred Drakee4315f52000-05-09 19:53:39 +0000633
Tim Petersced69f82003-09-16 20:30:58 +0000634 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000635 encoding = PyUnicode_GetDefaultEncoding();
636
637 /* Shortcuts for common default encodings */
638 if (errors == NULL) {
639 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000640 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000641 else if (strcmp(encoding, "latin-1") == 0)
642 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000643#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
644 else if (strcmp(encoding, "mbcs") == 0)
645 return PyUnicode_AsMBCSString(unicode);
646#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000647 else if (strcmp(encoding, "ascii") == 0)
648 return PyUnicode_AsASCIIString(unicode);
649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650
651 /* Encode via the codec registry */
652 v = PyCodec_Encode(unicode, encoding, errors);
653 if (v == NULL)
654 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 if (!PyString_Check(v)) {
656 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000657 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 v->ob_type->tp_name);
659 Py_DECREF(v);
660 goto onError;
661 }
662 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 onError:
665 return NULL;
666}
667
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000668PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
669 const char *errors)
670{
671 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
672
673 if (v)
674 return v;
675 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
676 if (v && errors == NULL)
677 ((PyUnicodeObject *)unicode)->defenc = v;
678 return v;
679}
680
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
682{
683 if (!PyUnicode_Check(unicode)) {
684 PyErr_BadArgument();
685 goto onError;
686 }
687 return PyUnicode_AS_UNICODE(unicode);
688
689 onError:
690 return NULL;
691}
692
693int PyUnicode_GetSize(PyObject *unicode)
694{
695 if (!PyUnicode_Check(unicode)) {
696 PyErr_BadArgument();
697 goto onError;
698 }
699 return PyUnicode_GET_SIZE(unicode);
700
701 onError:
702 return -1;
703}
704
Thomas Wouters78890102000-07-22 19:25:51 +0000705const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000706{
707 return unicode_default_encoding;
708}
709
710int PyUnicode_SetDefaultEncoding(const char *encoding)
711{
712 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000713
Fred Drakee4315f52000-05-09 19:53:39 +0000714 /* Make sure the encoding is valid. As side effect, this also
715 loads the encoding into the codec registry cache. */
716 v = _PyCodec_Lookup(encoding);
717 if (v == NULL)
718 goto onError;
719 Py_DECREF(v);
720 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000721 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000722 sizeof(unicode_default_encoding));
723 return 0;
724
725 onError:
726 return -1;
727}
728
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000729/* error handling callback helper:
730 build arguments, call the callback and check the arguments,
731 if no exception occured, copy the replacement to the output
732 and adjust various state variables.
733 return 0 on success, -1 on error
734*/
735
736static
737int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
738 const char *encoding, const char *reason,
739 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
740 PyObject **output, int *outpos, Py_UNICODE **outptr)
741{
742 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
743
744 PyObject *restuple = NULL;
745 PyObject *repunicode = NULL;
746 int outsize = PyUnicode_GET_SIZE(*output);
747 int requiredsize;
748 int newpos;
749 Py_UNICODE *repptr;
750 int repsize;
751 int res = -1;
752
753 if (*errorHandler == NULL) {
754 *errorHandler = PyCodec_LookupError(errors);
755 if (*errorHandler == NULL)
756 goto onError;
757 }
758
759 if (*exceptionObject == NULL) {
760 *exceptionObject = PyUnicodeDecodeError_Create(
761 encoding, input, insize, *startinpos, *endinpos, reason);
762 if (*exceptionObject == NULL)
763 goto onError;
764 }
765 else {
766 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
767 goto onError;
768 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
769 goto onError;
770 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
771 goto onError;
772 }
773
774 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
775 if (restuple == NULL)
776 goto onError;
777 if (!PyTuple_Check(restuple)) {
778 PyErr_Format(PyExc_TypeError, &argparse[4]);
779 goto onError;
780 }
781 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
782 goto onError;
783 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000784 newpos = insize+newpos;
785 if (newpos<0 || newpos>insize) {
786 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
787 goto onError;
788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000789
790 /* need more space? (at least enough for what we
791 have+the replacement+the rest of the string (starting
792 at the new input position), so we won't have to check space
793 when there are no errors in the rest of the string) */
794 repptr = PyUnicode_AS_UNICODE(repunicode);
795 repsize = PyUnicode_GET_SIZE(repunicode);
796 requiredsize = *outpos + repsize + insize-newpos;
797 if (requiredsize > outsize) {
798 if (requiredsize<2*outsize)
799 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000800 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000801 goto onError;
802 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
803 }
804 *endinpos = newpos;
805 *inptr = input + newpos;
806 Py_UNICODE_COPY(*outptr, repptr, repsize);
807 *outptr += repsize;
808 *outpos += repsize;
809 /* we made it! */
810 res = 0;
811
812 onError:
813 Py_XDECREF(restuple);
814 return res;
815}
816
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000817/* --- UTF-7 Codec -------------------------------------------------------- */
818
819/* see RFC2152 for details */
820
/* Per-character classification used by the UTF-7 codec for the 128 ASCII
   code points.  Each entry says whether the character may be written
   directly or has to be base64-encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f:  space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f:  0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f:  @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f:  P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f:  ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f:  p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
839
/* SPECIAL(c, encodeO, encodeWS): true when character c has to be
   base64-encoded.  Anything above 127 and every class-1 entry of
   utf7_special is always special; whitespace (class 2) and Set O (class 3)
   characters are special only when the matching flag is non-zero. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c) > 127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c belongs to the base64 alphabet.
   The ranges are tested explicitly instead of calling isalnum(): isalnum()
   is undefined for arguments that are neither EOF nor representable as
   unsigned char (callers pass Py_UNICODE values), and its answer depends on
   the current locale.  UB64() below already assumes ASCII ordering. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of base64 character c.  Only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least six bits are pending in the bit
   buffer ch (which holds `bits` valid low-order bits), write one base64
   character through `out`, advance `out` and decrease `bits` by six. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are pending
   in the bit buffer ch, extract one 16-bit unit and store it through `out`.
   The expansion site must provide a variable `errmsg` and a label
   `utf7Error`; both are used when a unit in 0xDC00..0xDFFF is met.
   NOTE(review): the range tested is 0xDC00..0xDFFF while the comments talk
   about the high surrogate - confirm which half of the pair was intended. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }

874
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000875PyObject *PyUnicode_DecodeUTF7(const char *s,
876 int size,
877 const char *errors)
878{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000879 const char *starts = s;
880 int startinpos;
881 int endinpos;
882 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000883 const char *e;
884 PyUnicodeObject *unicode;
885 Py_UNICODE *p;
886 const char *errmsg = "";
887 int inShift = 0;
888 unsigned int bitsleft = 0;
889 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000890 int surrogate = 0;
891 PyObject *errorHandler = NULL;
892 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000893
894 unicode = _PyUnicode_New(size);
895 if (!unicode)
896 return NULL;
897 if (size == 0)
898 return (PyObject *)unicode;
899
900 p = unicode->str;
901 e = s + size;
902
903 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 Py_UNICODE ch;
905 restart:
906 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000907
908 if (inShift) {
909 if ((ch == '-') || !B64CHAR(ch)) {
910 inShift = 0;
911 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000912
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000913 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
914 if (bitsleft >= 6) {
915 /* The shift sequence has a partial character in it. If
916 bitsleft < 6 then we could just classify it as padding
917 but that is not the case here */
918
919 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000920 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000921 }
922 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000923 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000924 here so indicate the potential of a misencoded character. */
925
926 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
927 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
928 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000929 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000930 }
931
932 if (ch == '-') {
933 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000934 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000935 inShift = 1;
936 }
937 } else if (SPECIAL(ch,0,0)) {
938 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000939 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000940 } else {
941 *p++ = ch;
942 }
943 } else {
944 charsleft = (charsleft << 6) | UB64(ch);
945 bitsleft += 6;
946 s++;
947 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
948 }
949 }
950 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000952 s++;
953 if (s < e && *s == '-') {
954 s++;
955 *p++ = '+';
956 } else
957 {
958 inShift = 1;
959 bitsleft = 0;
960 }
961 }
962 else if (SPECIAL(ch,0,0)) {
963 errmsg = "unexpected special character";
964 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000965 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000966 }
967 else {
968 *p++ = ch;
969 s++;
970 }
971 continue;
972 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000973 outpos = p-PyUnicode_AS_UNICODE(unicode);
974 endinpos = s-starts;
975 if (unicode_decode_call_errorhandler(
976 errors, &errorHandler,
977 "utf7", errmsg,
978 starts, size, &startinpos, &endinpos, &exc, &s,
979 (PyObject **)&unicode, &outpos, &p))
980 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000981 }
982
983 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000984 outpos = p-PyUnicode_AS_UNICODE(unicode);
985 endinpos = size;
986 if (unicode_decode_call_errorhandler(
987 errors, &errorHandler,
988 "utf7", "unterminated shift sequence",
989 starts, size, &startinpos, &endinpos, &exc, &s,
990 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000992 if (s < e)
993 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000994 }
995
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000996 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 goto onError;
998
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000999 Py_XDECREF(errorHandler);
1000 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001001 return (PyObject *)unicode;
1002
1003onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001004 Py_XDECREF(errorHandler);
1005 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006 Py_DECREF(unicode);
1007 return NULL;
1008}
1009
1010
1011PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1012 int size,
1013 int encodeSetO,
1014 int encodeWhiteSpace,
1015 const char *errors)
1016{
1017 PyObject *v;
1018 /* It might be possible to tighten this worst case */
1019 unsigned int cbAllocated = 5 * size;
1020 int inShift = 0;
1021 int i = 0;
1022 unsigned int bitsleft = 0;
1023 unsigned long charsleft = 0;
1024 char * out;
1025 char * start;
1026
1027 if (size == 0)
1028 return PyString_FromStringAndSize(NULL, 0);
1029
1030 v = PyString_FromStringAndSize(NULL, cbAllocated);
1031 if (v == NULL)
1032 return NULL;
1033
1034 start = out = PyString_AS_STRING(v);
1035 for (;i < size; ++i) {
1036 Py_UNICODE ch = s[i];
1037
1038 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001039 if (ch == '+') {
1040 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001041 *out++ = '-';
1042 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1043 charsleft = ch;
1044 bitsleft = 16;
1045 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001046 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001047 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001048 } else {
1049 *out++ = (char) ch;
1050 }
1051 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1053 *out++ = B64(charsleft << (6-bitsleft));
1054 charsleft = 0;
1055 bitsleft = 0;
1056 /* Characters not in the BASE64 set implicitly unshift the sequence
1057 so no '-' is required, except if the character is itself a '-' */
1058 if (B64CHAR(ch) || ch == '-') {
1059 *out++ = '-';
1060 }
1061 inShift = 0;
1062 *out++ = (char) ch;
1063 } else {
1064 bitsleft += 16;
1065 charsleft = (charsleft << 16) | ch;
1066 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1067
1068 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001069 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001070 or '-' then the shift sequence will be terminated implicitly and we
1071 don't have to insert a '-'. */
1072
1073 if (bitsleft == 0) {
1074 if (i + 1 < size) {
1075 Py_UNICODE ch2 = s[i+1];
1076
1077 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001078
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001079 } else if (B64CHAR(ch2) || ch2 == '-') {
1080 *out++ = '-';
1081 inShift = 0;
1082 } else {
1083 inShift = 0;
1084 }
1085
1086 }
1087 else {
1088 *out++ = '-';
1089 inShift = 0;
1090 }
1091 }
Tim Petersced69f82003-09-16 20:30:58 +00001092 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001094 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001095 if (bitsleft) {
1096 *out++= B64(charsleft << (6-bitsleft) );
1097 *out++ = '-';
1098 }
1099
Tim Peters5de98422002-04-27 18:44:32 +00001100 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001101 return v;
1102}
1103
1104#undef SPECIAL
1105#undef B64
1106#undef B64CHAR
1107#undef UB64
1108#undef ENCODE
1109#undef DECODE
1110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111/* --- UTF-8 Codec -------------------------------------------------------- */
1112
/* Length in bytes of the UTF-8 sequence introduced by each possible first
   byte.  Zero marks a byte that is not a legal prefix.  See RFC 2279 for
   details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six bytes; 0xfe and 0xff are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1134
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135PyObject *PyUnicode_DecodeUTF8(const char *s,
1136 int size,
1137 const char *errors)
1138{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001139 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001141 int startinpos;
1142 int endinpos;
1143 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 const char *e;
1145 PyUnicodeObject *unicode;
1146 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001147 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001148 PyObject *errorHandler = NULL;
1149 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150
1151 /* Note: size will always be longer than the resulting Unicode
1152 character count */
1153 unicode = _PyUnicode_New(size);
1154 if (!unicode)
1155 return NULL;
1156 if (size == 0)
1157 return (PyObject *)unicode;
1158
1159 /* Unpack UTF-8 encoded data */
1160 p = unicode->str;
1161 e = s + size;
1162
1163 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001164 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165
1166 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001167 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 s++;
1169 continue;
1170 }
1171
1172 n = utf8_code_length[ch];
1173
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001174 if (s + n > e) {
1175 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001176 startinpos = s-starts;
1177 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 goto utf8Error;
1179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180
1181 switch (n) {
1182
1183 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001184 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001185 startinpos = s-starts;
1186 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001187 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188
1189 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001190 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001191 startinpos = s-starts;
1192 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001193 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194
1195 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001196 if ((s[1] & 0xc0) != 0x80) {
1197 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001198 startinpos = s-starts;
1199 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001200 goto utf8Error;
1201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001203 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001204 startinpos = s-starts;
1205 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001206 errmsg = "illegal encoding";
1207 goto utf8Error;
1208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001210 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 break;
1212
1213 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001214 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001215 (s[2] & 0xc0) != 0x80) {
1216 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001217 startinpos = s-starts;
1218 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001219 goto utf8Error;
1220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001222 if (ch < 0x0800) {
1223 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001224 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001225
1226 XXX For wide builds (UCS-4) we should probably try
1227 to recombine the surrogates into a single code
1228 unit.
1229 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001230 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001231 startinpos = s-starts;
1232 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001233 goto utf8Error;
1234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001236 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001237 break;
1238
1239 case 4:
1240 if ((s[1] & 0xc0) != 0x80 ||
1241 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001242 (s[3] & 0xc0) != 0x80) {
1243 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001244 startinpos = s-starts;
1245 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001246 goto utf8Error;
1247 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001248 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1249 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1250 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001251 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001252 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001253 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001254 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001255 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001256 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001257 startinpos = s-starts;
1258 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001259 goto utf8Error;
1260 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001261#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001262 *p++ = (Py_UNICODE)ch;
1263#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001264 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001265
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001266 /* translate from 10000..10FFFF to 0..FFFF */
1267 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001268
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001269 /* high surrogate = top 10 bits added to D800 */
1270 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001271
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001272 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001273 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001274#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275 break;
1276
1277 default:
1278 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001279 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 }
1284 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001286
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001288 outpos = p-PyUnicode_AS_UNICODE(unicode);
1289 if (unicode_decode_call_errorhandler(
1290 errors, &errorHandler,
1291 "utf8", errmsg,
1292 starts, size, &startinpos, &endinpos, &exc, &s,
1293 (PyObject **)&unicode, &outpos, &p))
1294 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295 }
1296
1297 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001298 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299 goto onError;
1300
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001301 Py_XDECREF(errorHandler);
1302 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 return (PyObject *)unicode;
1304
1305onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001306 Py_XDECREF(errorHandler);
1307 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 Py_DECREF(unicode);
1309 return NULL;
1310}
1311
Tim Peters602f7402002-04-27 18:03:26 +00001312/* Allocation strategy: if the string is short, convert into a stack buffer
1313 and allocate exactly as much space needed at the end. Else allocate the
1314 maximum possible needed (4 result bytes per Unicode character), and return
1315 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001316*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001317PyObject *
1318PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1319 int size,
1320 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321{
Tim Peters602f7402002-04-27 18:03:26 +00001322#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001323
Tim Peters602f7402002-04-27 18:03:26 +00001324 int i; /* index into s of next input byte */
1325 PyObject *v; /* result string object */
1326 char *p; /* next free byte in output buffer */
1327 int nallocated; /* number of result bytes allocated */
1328 int nneeded; /* number of result bytes needed */
1329 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001330
Tim Peters602f7402002-04-27 18:03:26 +00001331 assert(s != NULL);
1332 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333
Tim Peters602f7402002-04-27 18:03:26 +00001334 if (size <= MAX_SHORT_UNICHARS) {
1335 /* Write into the stack buffer; nallocated can't overflow.
1336 * At the end, we'll allocate exactly as much heap space as it
1337 * turns out we need.
1338 */
1339 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1340 v = NULL; /* will allocate after we're done */
1341 p = stackbuf;
1342 }
1343 else {
1344 /* Overallocate on the heap, and give the excess back at the end. */
1345 nallocated = size * 4;
1346 if (nallocated / 4 != size) /* overflow! */
1347 return PyErr_NoMemory();
1348 v = PyString_FromStringAndSize(NULL, nallocated);
1349 if (v == NULL)
1350 return NULL;
1351 p = PyString_AS_STRING(v);
1352 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001353
Tim Peters602f7402002-04-27 18:03:26 +00001354 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001355 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001356
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001357 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001358 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001360
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001362 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001363 *p++ = (char)(0xc0 | (ch >> 6));
1364 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001365 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001366 else {
Tim Peters602f7402002-04-27 18:03:26 +00001367 /* Encode UCS2 Unicode ordinals */
1368 if (ch < 0x10000) {
1369 /* Special case: check for high surrogate */
1370 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1371 Py_UCS4 ch2 = s[i];
1372 /* Check for low surrogate and combine the two to
1373 form a UCS4 value */
1374 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001375 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001376 i++;
1377 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001378 }
Tim Peters602f7402002-04-27 18:03:26 +00001379 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001380 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001381 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001382 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1383 *p++ = (char)(0x80 | (ch & 0x3f));
1384 continue;
1385 }
1386encodeUCS4:
1387 /* Encode UCS4 Unicode ordinals */
1388 *p++ = (char)(0xf0 | (ch >> 18));
1389 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1390 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1391 *p++ = (char)(0x80 | (ch & 0x3f));
1392 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001394
Tim Peters602f7402002-04-27 18:03:26 +00001395 if (v == NULL) {
1396 /* This was stack allocated. */
1397 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1398 assert(nneeded <= nallocated);
1399 v = PyString_FromStringAndSize(stackbuf, nneeded);
1400 }
1401 else {
1402 /* Cut back to size actually needed. */
1403 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1404 assert(nneeded <= nallocated);
1405 _PyString_Resize(&v, nneeded);
1406 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001407 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001408
Tim Peters602f7402002-04-27 18:03:26 +00001409#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410}
1411
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1413{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414 if (!PyUnicode_Check(unicode)) {
1415 PyErr_BadArgument();
1416 return NULL;
1417 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001418 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1419 PyUnicode_GET_SIZE(unicode),
1420 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421}
1422
1423/* --- UTF-16 Codec ------------------------------------------------------- */
1424
Tim Peters772747b2001-08-09 22:21:55 +00001425PyObject *
1426PyUnicode_DecodeUTF16(const char *s,
1427 int size,
1428 const char *errors,
1429 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001431 const char *starts = s;
1432 int startinpos;
1433 int endinpos;
1434 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 PyUnicodeObject *unicode;
1436 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001437 const unsigned char *q, *e;
1438 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001439 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001440 /* Offsets from q for retrieving byte pairs in the right order. */
1441#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1442 int ihi = 1, ilo = 0;
1443#else
1444 int ihi = 0, ilo = 1;
1445#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 PyObject *errorHandler = NULL;
1447 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448
1449 /* Note: size will always be longer than the resulting Unicode
1450 character count */
1451 unicode = _PyUnicode_New(size);
1452 if (!unicode)
1453 return NULL;
1454 if (size == 0)
1455 return (PyObject *)unicode;
1456
1457 /* Unpack UTF-16 encoded data */
1458 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001459 q = (unsigned char *)s;
1460 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461
1462 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001463 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001465 /* Check for BOM marks (U+FEFF) in the input and adjust current
1466 byte order setting accordingly. In native mode, the leading BOM
1467 mark is skipped, in all other modes, it is copied to the output
1468 stream as-is (giving a ZWNBSP character). */
1469 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001470 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001471#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001472 if (bom == 0xFEFF) {
1473 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001474 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001475 }
1476 else if (bom == 0xFFFE) {
1477 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001478 bo = 1;
1479 }
Tim Petersced69f82003-09-16 20:30:58 +00001480#else
Tim Peters772747b2001-08-09 22:21:55 +00001481 if (bom == 0xFEFF) {
1482 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001483 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001484 }
1485 else if (bom == 0xFFFE) {
1486 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001487 bo = -1;
1488 }
1489#endif
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491
Tim Peters772747b2001-08-09 22:21:55 +00001492 if (bo == -1) {
1493 /* force LE */
1494 ihi = 1;
1495 ilo = 0;
1496 }
1497 else if (bo == 1) {
1498 /* force BE */
1499 ihi = 0;
1500 ilo = 1;
1501 }
1502
1503 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 Py_UNICODE ch;
1505 /* remaing bytes at the end? (size should be even) */
1506 if (e-q<2) {
1507 errmsg = "truncated data";
1508 startinpos = ((const char *)q)-starts;
1509 endinpos = ((const char *)e)-starts;
1510 goto utf16Error;
1511 /* The remaining input chars are ignored if the callback
1512 chooses to skip the input */
1513 }
1514 ch = (q[ihi] << 8) | q[ilo];
1515
Tim Peters772747b2001-08-09 22:21:55 +00001516 q += 2;
1517
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518 if (ch < 0xD800 || ch > 0xDFFF) {
1519 *p++ = ch;
1520 continue;
1521 }
1522
1523 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001524 if (q >= e) {
1525 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001526 startinpos = (((const char *)q)-2)-starts;
1527 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001528 goto utf16Error;
1529 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001530 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001531 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1532 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001533 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001534#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001535 *p++ = ch;
1536 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001537#else
1538 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001539#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001540 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001541 }
1542 else {
1543 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001544 startinpos = (((const char *)q)-4)-starts;
1545 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001546 goto utf16Error;
1547 }
1548
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001550 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001551 startinpos = (((const char *)q)-2)-starts;
1552 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001553 /* Fall through to report the error */
1554
1555 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001556 outpos = p-PyUnicode_AS_UNICODE(unicode);
1557 if (unicode_decode_call_errorhandler(
1558 errors, &errorHandler,
1559 "utf16", errmsg,
1560 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1561 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001562 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 }
1564
1565 if (byteorder)
1566 *byteorder = bo;
1567
1568 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001569 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570 goto onError;
1571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001572 Py_XDECREF(errorHandler);
1573 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001574 return (PyObject *)unicode;
1575
1576onError:
1577 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001578 Py_XDECREF(errorHandler);
1579 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580 return NULL;
1581}
1582
Tim Peters772747b2001-08-09 22:21:55 +00001583PyObject *
1584PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1585 int size,
1586 const char *errors,
1587 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588{
1589 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001590 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001591#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001592 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001593#else
1594 const int pairs = 0;
1595#endif
Tim Peters772747b2001-08-09 22:21:55 +00001596 /* Offsets from p for storing byte pairs in the right order. */
1597#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1598 int ihi = 1, ilo = 0;
1599#else
1600 int ihi = 0, ilo = 1;
1601#endif
1602
1603#define STORECHAR(CH) \
1604 do { \
1605 p[ihi] = ((CH) >> 8) & 0xff; \
1606 p[ilo] = (CH) & 0xff; \
1607 p += 2; \
1608 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001610#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001611 for (i = pairs = 0; i < size; i++)
1612 if (s[i] >= 0x10000)
1613 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001614#endif
Tim Petersced69f82003-09-16 20:30:58 +00001615 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001616 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617 if (v == NULL)
1618 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619
Tim Peters772747b2001-08-09 22:21:55 +00001620 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001622 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001623 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001624 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001625
1626 if (byteorder == -1) {
1627 /* force LE */
1628 ihi = 1;
1629 ilo = 0;
1630 }
1631 else if (byteorder == 1) {
1632 /* force BE */
1633 ihi = 0;
1634 ilo = 1;
1635 }
1636
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637 while (size-- > 0) {
1638 Py_UNICODE ch = *s++;
1639 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001640#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001641 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001642 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1643 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001645#endif
Tim Peters772747b2001-08-09 22:21:55 +00001646 STORECHAR(ch);
1647 if (ch2)
1648 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001650 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001651#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652}
1653
1654PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1655{
1656 if (!PyUnicode_Check(unicode)) {
1657 PyErr_BadArgument();
1658 return NULL;
1659 }
1660 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1661 PyUnicode_GET_SIZE(unicode),
1662 NULL,
1663 0);
1664}
1665
1666/* --- Unicode Escape Codec ----------------------------------------------- */
1667
Fredrik Lundh06d12682001-01-24 07:59:11 +00001668static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001669
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1671 int size,
1672 const char *errors)
1673{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001674 const char *starts = s;
1675 int startinpos;
1676 int endinpos;
1677 int outpos;
1678 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001680 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001682 char* message;
1683 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 PyObject *errorHandler = NULL;
1685 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001686
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 /* Escaped strings will always be longer than the resulting
1688 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001689 length after conversion to the true value.
1690 (but if the error callback returns a long replacement string
1691 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 v = _PyUnicode_New(size);
1693 if (v == NULL)
1694 goto onError;
1695 if (size == 0)
1696 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001698 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001700
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701 while (s < end) {
1702 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001703 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001704 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705
1706 /* Non-escape characters are interpreted as Unicode ordinals */
1707 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001708 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001709 continue;
1710 }
1711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001712 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 /* \ - Escapes */
1714 s++;
1715 switch (*s++) {
1716
1717 /* \x escapes */
1718 case '\n': break;
1719 case '\\': *p++ = '\\'; break;
1720 case '\'': *p++ = '\''; break;
1721 case '\"': *p++ = '\"'; break;
1722 case 'b': *p++ = '\b'; break;
1723 case 'f': *p++ = '\014'; break; /* FF */
1724 case 't': *p++ = '\t'; break;
1725 case 'n': *p++ = '\n'; break;
1726 case 'r': *p++ = '\r'; break;
1727 case 'v': *p++ = '\013'; break; /* VT */
1728 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1729
1730 /* \OOO (octal) escapes */
1731 case '0': case '1': case '2': case '3':
1732 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001733 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001735 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001737 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001739 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740 break;
1741
Fredrik Lundhccc74732001-02-18 22:13:49 +00001742 /* hex escapes */
1743 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001744 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001745 digits = 2;
1746 message = "truncated \\xXX escape";
1747 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748
Fredrik Lundhccc74732001-02-18 22:13:49 +00001749 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001751 digits = 4;
1752 message = "truncated \\uXXXX escape";
1753 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
Fredrik Lundhccc74732001-02-18 22:13:49 +00001755 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001756 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001757 digits = 8;
1758 message = "truncated \\UXXXXXXXX escape";
1759 hexescape:
1760 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 outpos = p-PyUnicode_AS_UNICODE(v);
1762 if (s+digits>end) {
1763 endinpos = size;
1764 if (unicode_decode_call_errorhandler(
1765 errors, &errorHandler,
1766 "unicodeescape", "end of string in escape sequence",
1767 starts, size, &startinpos, &endinpos, &exc, &s,
1768 (PyObject **)&v, &outpos, &p))
1769 goto onError;
1770 goto nextByte;
1771 }
1772 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001773 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001774 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 endinpos = (s+i+1)-starts;
1776 if (unicode_decode_call_errorhandler(
1777 errors, &errorHandler,
1778 "unicodeescape", message,
1779 starts, size, &startinpos, &endinpos, &exc, &s,
1780 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001781 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001782 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001783 }
1784 chr = (chr<<4) & ~0xF;
1785 if (c >= '0' && c <= '9')
1786 chr += c - '0';
1787 else if (c >= 'a' && c <= 'f')
1788 chr += 10 + c - 'a';
1789 else
1790 chr += 10 + c - 'A';
1791 }
1792 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001793 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001794 /* _decoding_error will have already written into the
1795 target buffer. */
1796 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001797 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001798 /* when we get here, chr is a 32-bit unicode character */
1799 if (chr <= 0xffff)
1800 /* UCS-2 character */
1801 *p++ = (Py_UNICODE) chr;
1802 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001803 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001804 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001805#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001806 *p++ = chr;
1807#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001808 chr -= 0x10000L;
1809 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001810 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001811#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001812 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 endinpos = s-starts;
1814 outpos = p-PyUnicode_AS_UNICODE(v);
1815 if (unicode_decode_call_errorhandler(
1816 errors, &errorHandler,
1817 "unicodeescape", "illegal Unicode character",
1818 starts, size, &startinpos, &endinpos, &exc, &s,
1819 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001820 goto onError;
1821 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 break;
1823
1824 /* \N{name} */
1825 case 'N':
1826 message = "malformed \\N character escape";
1827 if (ucnhash_CAPI == NULL) {
1828 /* load the unicode data module */
1829 PyObject *m, *v;
1830 m = PyImport_ImportModule("unicodedata");
1831 if (m == NULL)
1832 goto ucnhashError;
1833 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1834 Py_DECREF(m);
1835 if (v == NULL)
1836 goto ucnhashError;
1837 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1838 Py_DECREF(v);
1839 if (ucnhash_CAPI == NULL)
1840 goto ucnhashError;
1841 }
1842 if (*s == '{') {
1843 const char *start = s+1;
1844 /* look for the closing brace */
1845 while (*s != '}' && s < end)
1846 s++;
1847 if (s > start && s < end && *s == '}') {
1848 /* found a name. look it up in the unicode database */
1849 message = "unknown Unicode character name";
1850 s++;
1851 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1852 goto store;
1853 }
1854 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001855 endinpos = s-starts;
1856 outpos = p-PyUnicode_AS_UNICODE(v);
1857 if (unicode_decode_call_errorhandler(
1858 errors, &errorHandler,
1859 "unicodeescape", message,
1860 starts, size, &startinpos, &endinpos, &exc, &s,
1861 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 break;
1864
1865 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001866 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 message = "\\ at end of string";
1868 s--;
1869 endinpos = s-starts;
1870 outpos = p-PyUnicode_AS_UNICODE(v);
1871 if (unicode_decode_call_errorhandler(
1872 errors, &errorHandler,
1873 "unicodeescape", message,
1874 starts, size, &startinpos, &endinpos, &exc, &s,
1875 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001876 goto onError;
1877 }
1878 else {
1879 *p++ = '\\';
1880 *p++ = (unsigned char)s[-1];
1881 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001882 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001884 nextByte:
1885 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001887 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001889 Py_XDECREF(errorHandler);
1890 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001891 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001892
Fredrik Lundhccc74732001-02-18 22:13:49 +00001893ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001894 PyErr_SetString(
1895 PyExc_UnicodeError,
1896 "\\N escapes not supported (can't load unicodedata module)"
1897 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 Py_XDECREF(errorHandler);
1899 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001900 return NULL;
1901
Fredrik Lundhccc74732001-02-18 22:13:49 +00001902onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001904 Py_XDECREF(errorHandler);
1905 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906 return NULL;
1907}
1908
1909/* Return a Unicode-Escape string version of the Unicode object.
1910
1911 If quotes is true, the string is enclosed in u"" or u'' quotes as
1912 appropriate.
1913
1914*/
1915
Barry Warsaw51ac5802000-03-20 16:36:48 +00001916static const Py_UNICODE *findchar(const Py_UNICODE *s,
1917 int size,
1918 Py_UNICODE ch);
1919
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920static
1921PyObject *unicodeescape_string(const Py_UNICODE *s,
1922 int size,
1923 int quotes)
1924{
1925 PyObject *repr;
1926 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001928 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929
1930 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1931 if (repr == NULL)
1932 return NULL;
1933
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001934 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935
1936 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001938 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 !findchar(s, size, '"')) ? '"' : '\'';
1940 }
1941 while (size-- > 0) {
1942 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001943
Guido van Rossumd57fd912000-03-10 22:53:23 +00001944 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001945 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001946 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 *p++ = '\\';
1948 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001949 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001950 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001951
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001952#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001953 /* Map 21-bit characters to '\U00xxxxxx' */
1954 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001955 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001956
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001957 /* Resize the string if necessary */
1958 if (offset + 12 > PyString_GET_SIZE(repr)) {
1959 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001960 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001961 p = PyString_AS_STRING(repr) + offset;
1962 }
1963
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001964 *p++ = '\\';
1965 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001966 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1967 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1968 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1969 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1970 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1971 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1972 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001973 *p++ = hexdigit[ch & 0x0000000F];
1974 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001975 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001976#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001977 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1978 else if (ch >= 0xD800 && ch < 0xDC00) {
1979 Py_UNICODE ch2;
1980 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00001981
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001982 ch2 = *s++;
1983 size--;
1984 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1985 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1986 *p++ = '\\';
1987 *p++ = 'U';
1988 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1989 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1990 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1991 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1992 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1993 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1994 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1995 *p++ = hexdigit[ucs & 0x0000000F];
1996 continue;
1997 }
1998 /* Fall through: isolated surrogates are copied as-is */
1999 s--;
2000 size++;
2001 }
2002
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002004 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 *p++ = '\\';
2006 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002007 *p++ = hexdigit[(ch >> 12) & 0x000F];
2008 *p++ = hexdigit[(ch >> 8) & 0x000F];
2009 *p++ = hexdigit[(ch >> 4) & 0x000F];
2010 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002012
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002013 /* Map special whitespace to '\t', \n', '\r' */
2014 else if (ch == '\t') {
2015 *p++ = '\\';
2016 *p++ = 't';
2017 }
2018 else if (ch == '\n') {
2019 *p++ = '\\';
2020 *p++ = 'n';
2021 }
2022 else if (ch == '\r') {
2023 *p++ = '\\';
2024 *p++ = 'r';
2025 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002026
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002027 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002028 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002030 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002031 *p++ = hexdigit[(ch >> 4) & 0x000F];
2032 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002033 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002034
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 /* Copy everything else as-is */
2036 else
2037 *p++ = (char) ch;
2038 }
2039 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002040 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041
2042 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002043 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 return repr;
2045}
2046
2047PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2048 int size)
2049{
2050 return unicodeescape_string(s, size, 0);
2051}
2052
2053PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2054{
2055 if (!PyUnicode_Check(unicode)) {
2056 PyErr_BadArgument();
2057 return NULL;
2058 }
2059 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2060 PyUnicode_GET_SIZE(unicode));
2061}
2062
2063/* --- Raw Unicode Escape Codec ------------------------------------------- */
2064
2065PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2066 int size,
2067 const char *errors)
2068{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069 const char *starts = s;
2070 int startinpos;
2071 int endinpos;
2072 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002074 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 const char *end;
2076 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002077 PyObject *errorHandler = NULL;
2078 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002079
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 /* Escaped strings will always be longer than the resulting
2081 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 length after conversion to the true value. (But decoding error
2083 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 v = _PyUnicode_New(size);
2085 if (v == NULL)
2086 goto onError;
2087 if (size == 0)
2088 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002089 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090 end = s + size;
2091 while (s < end) {
2092 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002093 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002095 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096
2097 /* Non-escape characters are interpreted as Unicode ordinals */
2098 if (*s != '\\') {
2099 *p++ = (unsigned char)*s++;
2100 continue;
2101 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002102 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103
2104 /* \u-escapes are only interpreted iff the number of leading
2105 backslashes if odd */
2106 bs = s;
2107 for (;s < end;) {
2108 if (*s != '\\')
2109 break;
2110 *p++ = (unsigned char)*s++;
2111 }
2112 if (((s - bs) & 1) == 0 ||
2113 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002114 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 continue;
2116 }
2117 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002118 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002119 s++;
2120
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002121 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002123 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002124 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002126 endinpos = s-starts;
2127 if (unicode_decode_call_errorhandler(
2128 errors, &errorHandler,
2129 "rawunicodeescape", "truncated \\uXXXX",
2130 starts, size, &startinpos, &endinpos, &exc, &s,
2131 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 }
2135 x = (x<<4) & ~0xF;
2136 if (c >= '0' && c <= '9')
2137 x += c - '0';
2138 else if (c >= 'a' && c <= 'f')
2139 x += 10 + c - 'a';
2140 else
2141 x += 10 + c - 'A';
2142 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143#ifndef Py_UNICODE_WIDE
2144 if (x > 0x10000) {
2145 if (unicode_decode_call_errorhandler(
2146 errors, &errorHandler,
2147 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2148 starts, size, &startinpos, &endinpos, &exc, &s,
2149 (PyObject **)&v, &outpos, &p))
2150 goto onError;
2151 }
2152#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002153 *p++ = x;
2154 nextByte:
2155 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002157 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002158 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002159 Py_XDECREF(errorHandler);
2160 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002162
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 onError:
2164 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002165 Py_XDECREF(errorHandler);
2166 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 return NULL;
2168}
2169
2170PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2171 int size)
2172{
2173 PyObject *repr;
2174 char *p;
2175 char *q;
2176
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002177 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002179#ifdef Py_UNICODE_WIDE
2180 repr = PyString_FromStringAndSize(NULL, 10 * size);
2181#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002183#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184 if (repr == NULL)
2185 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002186 if (size == 0)
2187 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188
2189 p = q = PyString_AS_STRING(repr);
2190 while (size-- > 0) {
2191 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002192#ifdef Py_UNICODE_WIDE
2193 /* Map 32-bit characters to '\Uxxxxxxxx' */
2194 if (ch >= 0x10000) {
2195 *p++ = '\\';
2196 *p++ = 'U';
2197 *p++ = hexdigit[(ch >> 28) & 0xf];
2198 *p++ = hexdigit[(ch >> 24) & 0xf];
2199 *p++ = hexdigit[(ch >> 20) & 0xf];
2200 *p++ = hexdigit[(ch >> 16) & 0xf];
2201 *p++ = hexdigit[(ch >> 12) & 0xf];
2202 *p++ = hexdigit[(ch >> 8) & 0xf];
2203 *p++ = hexdigit[(ch >> 4) & 0xf];
2204 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002205 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002206 else
2207#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208 /* Map 16-bit characters to '\uxxxx' */
2209 if (ch >= 256) {
2210 *p++ = '\\';
2211 *p++ = 'u';
2212 *p++ = hexdigit[(ch >> 12) & 0xf];
2213 *p++ = hexdigit[(ch >> 8) & 0xf];
2214 *p++ = hexdigit[(ch >> 4) & 0xf];
2215 *p++ = hexdigit[ch & 15];
2216 }
2217 /* Copy everything else as-is */
2218 else
2219 *p++ = (char) ch;
2220 }
2221 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002222 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 return repr;
2224}
2225
2226PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2227{
2228 if (!PyUnicode_Check(unicode)) {
2229 PyErr_BadArgument();
2230 return NULL;
2231 }
2232 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2233 PyUnicode_GET_SIZE(unicode));
2234}
2235
2236/* --- Latin-1 Codec ------------------------------------------------------ */
2237
2238PyObject *PyUnicode_DecodeLatin1(const char *s,
2239 int size,
2240 const char *errors)
2241{
2242 PyUnicodeObject *v;
2243 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002244
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002246 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002247 Py_UNICODE r = *(unsigned char*)s;
2248 return PyUnicode_FromUnicode(&r, 1);
2249 }
2250
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 v = _PyUnicode_New(size);
2252 if (v == NULL)
2253 goto onError;
2254 if (size == 0)
2255 return (PyObject *)v;
2256 p = PyUnicode_AS_UNICODE(v);
2257 while (size-- > 0)
2258 *p++ = (unsigned char)*s++;
2259 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002260
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 onError:
2262 Py_XDECREF(v);
2263 return NULL;
2264}
2265
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266/* create or adjust a UnicodeEncodeError */
2267static void make_encode_exception(PyObject **exceptionObject,
2268 const char *encoding,
2269 const Py_UNICODE *unicode, int size,
2270 int startpos, int endpos,
2271 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002273 if (*exceptionObject == NULL) {
2274 *exceptionObject = PyUnicodeEncodeError_Create(
2275 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 }
2277 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2279 goto onError;
2280 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2281 goto onError;
2282 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2283 goto onError;
2284 return;
2285 onError:
2286 Py_DECREF(*exceptionObject);
2287 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 }
2289}
2290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002291/* raises a UnicodeEncodeError */
2292static void raise_encode_exception(PyObject **exceptionObject,
2293 const char *encoding,
2294 const Py_UNICODE *unicode, int size,
2295 int startpos, int endpos,
2296 const char *reason)
2297{
2298 make_encode_exception(exceptionObject,
2299 encoding, unicode, size, startpos, endpos, reason);
2300 if (*exceptionObject != NULL)
2301 PyCodec_StrictErrors(*exceptionObject);
2302}
2303
2304/* error handling callback helper:
2305 build arguments, call the callback and check the arguments,
2306 put the result into newpos and return the replacement string, which
2307 has to be freed by the caller */
2308static PyObject *unicode_encode_call_errorhandler(const char *errors,
2309 PyObject **errorHandler,
2310 const char *encoding, const char *reason,
2311 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2312 int startpos, int endpos,
2313 int *newpos)
2314{
2315 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2316
2317 PyObject *restuple;
2318 PyObject *resunicode;
2319
2320 if (*errorHandler == NULL) {
2321 *errorHandler = PyCodec_LookupError(errors);
2322 if (*errorHandler == NULL)
2323 return NULL;
2324 }
2325
2326 make_encode_exception(exceptionObject,
2327 encoding, unicode, size, startpos, endpos, reason);
2328 if (*exceptionObject == NULL)
2329 return NULL;
2330
2331 restuple = PyObject_CallFunctionObjArgs(
2332 *errorHandler, *exceptionObject, NULL);
2333 if (restuple == NULL)
2334 return NULL;
2335 if (!PyTuple_Check(restuple)) {
2336 PyErr_Format(PyExc_TypeError, &argparse[4]);
2337 Py_DECREF(restuple);
2338 return NULL;
2339 }
2340 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2341 &resunicode, newpos)) {
2342 Py_DECREF(restuple);
2343 return NULL;
2344 }
2345 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002346 *newpos = size+*newpos;
2347 if (*newpos<0 || *newpos>size) {
2348 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2349 Py_DECREF(restuple);
2350 return NULL;
2351 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002352 Py_INCREF(resunicode);
2353 Py_DECREF(restuple);
2354 return resunicode;
2355}
2356
2357static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2358 int size,
2359 const char *errors,
2360 int limit)
2361{
2362 /* output object */
2363 PyObject *res;
2364 /* pointers to the beginning and end+1 of input */
2365 const Py_UNICODE *startp = p;
2366 const Py_UNICODE *endp = p + size;
2367 /* pointer to the beginning of the unencodable characters */
2368 /* const Py_UNICODE *badp = NULL; */
2369 /* pointer into the output */
2370 char *str;
2371 /* current output position */
2372 int respos = 0;
2373 int ressize;
2374 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2375 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2376 PyObject *errorHandler = NULL;
2377 PyObject *exc = NULL;
2378 /* the following variable is used for caching string comparisons
2379 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2380 int known_errorHandler = -1;
2381
2382 /* allocate enough for a simple encoding without
2383 replacements, if we need more, we'll resize */
2384 res = PyString_FromStringAndSize(NULL, size);
2385 if (res == NULL)
2386 goto onError;
2387 if (size == 0)
2388 return res;
2389 str = PyString_AS_STRING(res);
2390 ressize = size;
2391
2392 while (p<endp) {
2393 Py_UNICODE c = *p;
2394
2395 /* can we encode this? */
2396 if (c<limit) {
2397 /* no overflow check, because we know that the space is enough */
2398 *str++ = (char)c;
2399 ++p;
2400 }
2401 else {
2402 int unicodepos = p-startp;
2403 int requiredsize;
2404 PyObject *repunicode;
2405 int repsize;
2406 int newpos;
2407 int respos;
2408 Py_UNICODE *uni2;
2409 /* startpos for collecting unencodable chars */
2410 const Py_UNICODE *collstart = p;
2411 const Py_UNICODE *collend = p;
2412 /* find all unecodable characters */
2413 while ((collend < endp) && ((*collend)>=limit))
2414 ++collend;
2415 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2416 if (known_errorHandler==-1) {
2417 if ((errors==NULL) || (!strcmp(errors, "strict")))
2418 known_errorHandler = 1;
2419 else if (!strcmp(errors, "replace"))
2420 known_errorHandler = 2;
2421 else if (!strcmp(errors, "ignore"))
2422 known_errorHandler = 3;
2423 else if (!strcmp(errors, "xmlcharrefreplace"))
2424 known_errorHandler = 4;
2425 else
2426 known_errorHandler = 0;
2427 }
2428 switch (known_errorHandler) {
2429 case 1: /* strict */
2430 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2431 goto onError;
2432 case 2: /* replace */
2433 while (collstart++<collend)
2434 *str++ = '?'; /* fall through */
2435 case 3: /* ignore */
2436 p = collend;
2437 break;
2438 case 4: /* xmlcharrefreplace */
2439 respos = str-PyString_AS_STRING(res);
2440 /* determine replacement size (temporarily (mis)uses p) */
2441 for (p = collstart, repsize = 0; p < collend; ++p) {
2442 if (*p<10)
2443 repsize += 2+1+1;
2444 else if (*p<100)
2445 repsize += 2+2+1;
2446 else if (*p<1000)
2447 repsize += 2+3+1;
2448 else if (*p<10000)
2449 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002450#ifndef Py_UNICODE_WIDE
2451 else
2452 repsize += 2+5+1;
2453#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002454 else if (*p<100000)
2455 repsize += 2+5+1;
2456 else if (*p<1000000)
2457 repsize += 2+6+1;
2458 else
2459 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002460#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002461 }
2462 requiredsize = respos+repsize+(endp-collend);
2463 if (requiredsize > ressize) {
2464 if (requiredsize<2*ressize)
2465 requiredsize = 2*ressize;
2466 if (_PyString_Resize(&res, requiredsize))
2467 goto onError;
2468 str = PyString_AS_STRING(res) + respos;
2469 ressize = requiredsize;
2470 }
2471 /* generate replacement (temporarily (mis)uses p) */
2472 for (p = collstart; p < collend; ++p) {
2473 str += sprintf(str, "&#%d;", (int)*p);
2474 }
2475 p = collend;
2476 break;
2477 default:
2478 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2479 encoding, reason, startp, size, &exc,
2480 collstart-startp, collend-startp, &newpos);
2481 if (repunicode == NULL)
2482 goto onError;
2483 /* need more space? (at least enough for what we
2484 have+the replacement+the rest of the string, so
2485 we won't have to check space for encodable characters) */
2486 respos = str-PyString_AS_STRING(res);
2487 repsize = PyUnicode_GET_SIZE(repunicode);
2488 requiredsize = respos+repsize+(endp-collend);
2489 if (requiredsize > ressize) {
2490 if (requiredsize<2*ressize)
2491 requiredsize = 2*ressize;
2492 if (_PyString_Resize(&res, requiredsize)) {
2493 Py_DECREF(repunicode);
2494 goto onError;
2495 }
2496 str = PyString_AS_STRING(res) + respos;
2497 ressize = requiredsize;
2498 }
2499 /* check if there is anything unencodable in the replacement
2500 and copy it to the output */
2501 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2502 c = *uni2;
2503 if (c >= limit) {
2504 raise_encode_exception(&exc, encoding, startp, size,
2505 unicodepos, unicodepos+1, reason);
2506 Py_DECREF(repunicode);
2507 goto onError;
2508 }
2509 *str = (char)c;
2510 }
2511 p = startp + newpos;
2512 Py_DECREF(repunicode);
2513 }
2514 }
2515 }
2516 /* Resize if we allocated to much */
2517 respos = str-PyString_AS_STRING(res);
2518 if (respos<ressize)
2519 /* If this falls res will be NULL */
2520 _PyString_Resize(&res, respos);
2521 Py_XDECREF(errorHandler);
2522 Py_XDECREF(exc);
2523 return res;
2524
2525 onError:
2526 Py_XDECREF(res);
2527 Py_XDECREF(errorHandler);
2528 Py_XDECREF(exc);
2529 return NULL;
2530}
2531
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2533 int size,
2534 const char *errors)
2535{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537}
2538
2539PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2540{
2541 if (!PyUnicode_Check(unicode)) {
2542 PyErr_BadArgument();
2543 return NULL;
2544 }
2545 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2546 PyUnicode_GET_SIZE(unicode),
2547 NULL);
2548}
2549
2550/* --- 7-bit ASCII Codec -------------------------------------------------- */
2551
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552PyObject *PyUnicode_DecodeASCII(const char *s,
2553 int size,
2554 const char *errors)
2555{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002556 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 PyUnicodeObject *v;
2558 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002559 int startinpos;
2560 int endinpos;
2561 int outpos;
2562 const char *e;
2563 PyObject *errorHandler = NULL;
2564 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002565
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002567 if (size == 1 && *(unsigned char*)s < 128) {
2568 Py_UNICODE r = *(unsigned char*)s;
2569 return PyUnicode_FromUnicode(&r, 1);
2570 }
Tim Petersced69f82003-09-16 20:30:58 +00002571
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 v = _PyUnicode_New(size);
2573 if (v == NULL)
2574 goto onError;
2575 if (size == 0)
2576 return (PyObject *)v;
2577 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002578 e = s + size;
2579 while (s < e) {
2580 register unsigned char c = (unsigned char)*s;
2581 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 ++s;
2584 }
2585 else {
2586 startinpos = s-starts;
2587 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002588 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002589 if (unicode_decode_call_errorhandler(
2590 errors, &errorHandler,
2591 "ascii", "ordinal not in range(128)",
2592 starts, size, &startinpos, &endinpos, &exc, &s,
2593 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002597 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002598 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002599 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002600 Py_XDECREF(errorHandler);
2601 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002603
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 onError:
2605 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002606 Py_XDECREF(errorHandler);
2607 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 return NULL;
2609}
2610
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2612 int size,
2613 const char *errors)
2614{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002615 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616}
2617
2618PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2619{
2620 if (!PyUnicode_Check(unicode)) {
2621 PyErr_BadArgument();
2622 return NULL;
2623 }
2624 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2625 PyUnicode_GET_SIZE(unicode),
2626 NULL);
2627}
2628
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002629#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002630
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002631/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002632
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002633PyObject *PyUnicode_DecodeMBCS(const char *s,
2634 int size,
2635 const char *errors)
2636{
2637 PyUnicodeObject *v;
2638 Py_UNICODE *p;
2639
2640 /* First get the size of the result */
2641 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002642 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002643 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2644
2645 v = _PyUnicode_New(usize);
2646 if (v == NULL)
2647 return NULL;
2648 if (usize == 0)
2649 return (PyObject *)v;
2650 p = PyUnicode_AS_UNICODE(v);
2651 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2652 Py_DECREF(v);
2653 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2654 }
2655
2656 return (PyObject *)v;
2657}
2658
2659PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2660 int size,
2661 const char *errors)
2662{
2663 PyObject *repr;
2664 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002665 DWORD mbcssize;
2666
2667 /* If there are no characters, bail now! */
2668 if (size==0)
2669 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002670
2671 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002672 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002673 if (mbcssize==0)
2674 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2675
2676 repr = PyString_FromStringAndSize(NULL, mbcssize);
2677 if (repr == NULL)
2678 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002679 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002680 return repr;
2681
2682 /* Do the conversion */
2683 s = PyString_AS_STRING(repr);
2684 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2685 Py_DECREF(repr);
2686 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2687 }
2688 return repr;
2689}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002690
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002691PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2692{
2693 if (!PyUnicode_Check(unicode)) {
2694 PyErr_BadArgument();
2695 return NULL;
2696 }
2697 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2698 PyUnicode_GET_SIZE(unicode),
2699 NULL);
2700}
2701
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002702#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002703
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704/* --- Character Mapping Codec -------------------------------------------- */
2705
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706PyObject *PyUnicode_DecodeCharmap(const char *s,
2707 int size,
2708 PyObject *mapping,
2709 const char *errors)
2710{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 const char *starts = s;
2712 int startinpos;
2713 int endinpos;
2714 int outpos;
2715 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 PyUnicodeObject *v;
2717 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002718 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002719 PyObject *errorHandler = NULL;
2720 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002721
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 /* Default to Latin-1 */
2723 if (mapping == NULL)
2724 return PyUnicode_DecodeLatin1(s, size, errors);
2725
2726 v = _PyUnicode_New(size);
2727 if (v == NULL)
2728 goto onError;
2729 if (size == 0)
2730 return (PyObject *)v;
2731 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 e = s + size;
2733 while (s < e) {
2734 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 PyObject *w, *x;
2736
2737 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2738 w = PyInt_FromLong((long)ch);
2739 if (w == NULL)
2740 goto onError;
2741 x = PyObject_GetItem(mapping, w);
2742 Py_DECREF(w);
2743 if (x == NULL) {
2744 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002745 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002747 x = Py_None;
2748 Py_INCREF(x);
2749 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002750 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 }
2752
2753 /* Apply mapping */
2754 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002755 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 if (value < 0 || value > 65535) {
2757 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002758 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 Py_DECREF(x);
2760 goto onError;
2761 }
2762 *p++ = (Py_UNICODE)value;
2763 }
2764 else if (x == Py_None) {
2765 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 outpos = p-PyUnicode_AS_UNICODE(v);
2767 startinpos = s-starts;
2768 endinpos = startinpos+1;
2769 if (unicode_decode_call_errorhandler(
2770 errors, &errorHandler,
2771 "charmap", "character maps to <undefined>",
2772 starts, size, &startinpos, &endinpos, &exc, &s,
2773 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 Py_DECREF(x);
2775 goto onError;
2776 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002777 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 }
2779 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002780 int targetsize = PyUnicode_GET_SIZE(x);
2781
2782 if (targetsize == 1)
2783 /* 1-1 mapping */
2784 *p++ = *PyUnicode_AS_UNICODE(x);
2785
2786 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002788 if (targetsize > extrachars) {
2789 /* resize first */
2790 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2791 int needed = (targetsize - extrachars) + \
2792 (targetsize << 2);
2793 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002794 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002795 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002796 Py_DECREF(x);
2797 goto onError;
2798 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002799 p = PyUnicode_AS_UNICODE(v) + oldpos;
2800 }
2801 Py_UNICODE_COPY(p,
2802 PyUnicode_AS_UNICODE(x),
2803 targetsize);
2804 p += targetsize;
2805 extrachars -= targetsize;
2806 }
2807 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 }
2809 else {
2810 /* wrong return value */
2811 PyErr_SetString(PyExc_TypeError,
2812 "character mapping must return integer, None or unicode");
2813 Py_DECREF(x);
2814 goto onError;
2815 }
2816 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 }
2819 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002820 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 Py_XDECREF(errorHandler);
2823 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002825
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 Py_XDECREF(errorHandler);
2828 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829 Py_XDECREF(v);
2830 return NULL;
2831}
2832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833/* Lookup the character ch in the mapping. If the character
2834 can't be found, Py_None is returned (or NULL, if another
2835 error occured). */
2836static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 PyObject *w = PyInt_FromLong((long)c);
2839 PyObject *x;
2840
2841 if (w == NULL)
2842 return NULL;
2843 x = PyObject_GetItem(mapping, w);
2844 Py_DECREF(w);
2845 if (x == NULL) {
2846 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2847 /* No mapping found means: mapping is undefined. */
2848 PyErr_Clear();
2849 x = Py_None;
2850 Py_INCREF(x);
2851 return x;
2852 } else
2853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002855 else if (x == Py_None)
2856 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 else if (PyInt_Check(x)) {
2858 long value = PyInt_AS_LONG(x);
2859 if (value < 0 || value > 255) {
2860 PyErr_SetString(PyExc_TypeError,
2861 "character mapping must be in range(256)");
2862 Py_DECREF(x);
2863 return NULL;
2864 }
2865 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 else if (PyString_Check(x))
2868 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002870 /* wrong return value */
2871 PyErr_SetString(PyExc_TypeError,
2872 "character mapping must return integer, None or str");
2873 Py_DECREF(x);
2874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 }
2876}
2877
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002878/* lookup the character, put the result in the output string and adjust
2879 various state variables. Reallocate the output string if not enough
2880 space is available. Return a new reference to the object that
2881 was put in the output buffer, or Py_None, if the mapping was undefined
2882 (in which case no character was written) or NULL, if a
2883 reallocation error ocurred. The called must decref the result */
2884static
2885PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2886 PyObject **outobj, int *outpos)
2887{
2888 PyObject *rep = charmapencode_lookup(c, mapping);
2889
2890 if (rep==NULL)
2891 return NULL;
2892 else if (rep==Py_None)
2893 return rep;
2894 else {
2895 char *outstart = PyString_AS_STRING(*outobj);
2896 int outsize = PyString_GET_SIZE(*outobj);
2897 if (PyInt_Check(rep)) {
2898 int requiredsize = *outpos+1;
2899 if (outsize<requiredsize) {
2900 /* exponentially overallocate to minimize reallocations */
2901 if (requiredsize < 2*outsize)
2902 requiredsize = 2*outsize;
2903 if (_PyString_Resize(outobj, requiredsize)) {
2904 Py_DECREF(rep);
2905 return NULL;
2906 }
2907 outstart = PyString_AS_STRING(*outobj);
2908 }
2909 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2910 }
2911 else {
2912 const char *repchars = PyString_AS_STRING(rep);
2913 int repsize = PyString_GET_SIZE(rep);
2914 int requiredsize = *outpos+repsize;
2915 if (outsize<requiredsize) {
2916 /* exponentially overallocate to minimize reallocations */
2917 if (requiredsize < 2*outsize)
2918 requiredsize = 2*outsize;
2919 if (_PyString_Resize(outobj, requiredsize)) {
2920 Py_DECREF(rep);
2921 return NULL;
2922 }
2923 outstart = PyString_AS_STRING(*outobj);
2924 }
2925 memcpy(outstart + *outpos, repchars, repsize);
2926 *outpos += repsize;
2927 }
2928 }
2929 return rep;
2930}
2931
2932/* handle an error in PyUnicode_EncodeCharmap
2933 Return 0 on success, -1 on error */
2934static
2935int charmap_encoding_error(
2936 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2937 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002938 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002939 PyObject **res, int *respos)
2940{
2941 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2942 int repsize;
2943 int newpos;
2944 Py_UNICODE *uni2;
2945 /* startpos for collecting unencodable chars */
2946 int collstartpos = *inpos;
2947 int collendpos = *inpos+1;
2948 int collpos;
2949 char *encoding = "charmap";
2950 char *reason = "character maps to <undefined>";
2951
2952 PyObject *x;
2953 /* find all unencodable characters */
2954 while (collendpos < size) {
2955 x = charmapencode_lookup(p[collendpos], mapping);
2956 if (x==NULL)
2957 return -1;
2958 else if (x!=Py_None) {
2959 Py_DECREF(x);
2960 break;
2961 }
2962 Py_DECREF(x);
2963 ++collendpos;
2964 }
2965 /* cache callback name lookup
2966 * (if not done yet, i.e. it's the first error) */
2967 if (*known_errorHandler==-1) {
2968 if ((errors==NULL) || (!strcmp(errors, "strict")))
2969 *known_errorHandler = 1;
2970 else if (!strcmp(errors, "replace"))
2971 *known_errorHandler = 2;
2972 else if (!strcmp(errors, "ignore"))
2973 *known_errorHandler = 3;
2974 else if (!strcmp(errors, "xmlcharrefreplace"))
2975 *known_errorHandler = 4;
2976 else
2977 *known_errorHandler = 0;
2978 }
2979 switch (*known_errorHandler) {
2980 case 1: /* strict */
2981 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2982 return -1;
2983 case 2: /* replace */
2984 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2985 x = charmapencode_output('?', mapping, res, respos);
2986 if (x==NULL) {
2987 return -1;
2988 }
2989 else if (x==Py_None) {
2990 Py_DECREF(x);
2991 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2992 return -1;
2993 }
2994 Py_DECREF(x);
2995 }
2996 /* fall through */
2997 case 3: /* ignore */
2998 *inpos = collendpos;
2999 break;
3000 case 4: /* xmlcharrefreplace */
3001 /* generate replacement (temporarily (mis)uses p) */
3002 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3003 char buffer[2+29+1+1];
3004 char *cp;
3005 sprintf(buffer, "&#%d;", (int)p[collpos]);
3006 for (cp = buffer; *cp; ++cp) {
3007 x = charmapencode_output(*cp, mapping, res, respos);
3008 if (x==NULL)
3009 return -1;
3010 else if (x==Py_None) {
3011 Py_DECREF(x);
3012 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3013 return -1;
3014 }
3015 Py_DECREF(x);
3016 }
3017 }
3018 *inpos = collendpos;
3019 break;
3020 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003021 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022 encoding, reason, p, size, exceptionObject,
3023 collstartpos, collendpos, &newpos);
3024 if (repunicode == NULL)
3025 return -1;
3026 /* generate replacement */
3027 repsize = PyUnicode_GET_SIZE(repunicode);
3028 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3029 x = charmapencode_output(*uni2, mapping, res, respos);
3030 if (x==NULL) {
3031 Py_DECREF(repunicode);
3032 return -1;
3033 }
3034 else if (x==Py_None) {
3035 Py_DECREF(repunicode);
3036 Py_DECREF(x);
3037 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3038 return -1;
3039 }
3040 Py_DECREF(x);
3041 }
3042 *inpos = newpos;
3043 Py_DECREF(repunicode);
3044 }
3045 return 0;
3046}
3047
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3049 int size,
3050 PyObject *mapping,
3051 const char *errors)
3052{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 /* output object */
3054 PyObject *res = NULL;
3055 /* current input position */
3056 int inpos = 0;
3057 /* current output position */
3058 int respos = 0;
3059 PyObject *errorHandler = NULL;
3060 PyObject *exc = NULL;
3061 /* the following variable is used for caching string comparisons
3062 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3063 * 3=ignore, 4=xmlcharrefreplace */
3064 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065
3066 /* Default to Latin-1 */
3067 if (mapping == NULL)
3068 return PyUnicode_EncodeLatin1(p, size, errors);
3069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 /* allocate enough for a simple encoding without
3071 replacements, if we need more, we'll resize */
3072 res = PyString_FromStringAndSize(NULL, size);
3073 if (res == NULL)
3074 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003075 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 while (inpos<size) {
3079 /* try to encode it */
3080 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3081 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003083 if (x==Py_None) { /* unencodable character */
3084 if (charmap_encoding_error(p, size, &inpos, mapping,
3085 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003086 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003087 &res, &respos)) {
3088 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003089 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003092 else
3093 /* done with this character => adjust input position */
3094 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095 Py_DECREF(x);
3096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003098 /* Resize if we allocated to much */
3099 if (respos<PyString_GET_SIZE(res)) {
3100 if (_PyString_Resize(&res, respos))
3101 goto onError;
3102 }
3103 Py_XDECREF(exc);
3104 Py_XDECREF(errorHandler);
3105 return res;
3106
3107 onError:
3108 Py_XDECREF(res);
3109 Py_XDECREF(exc);
3110 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 return NULL;
3112}
3113
3114PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3115 PyObject *mapping)
3116{
3117 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3118 PyErr_BadArgument();
3119 return NULL;
3120 }
3121 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3122 PyUnicode_GET_SIZE(unicode),
3123 mapping,
3124 NULL);
3125}
3126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003127/* create or adjust a UnicodeTranslateError */
3128static void make_translate_exception(PyObject **exceptionObject,
3129 const Py_UNICODE *unicode, int size,
3130 int startpos, int endpos,
3131 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003133 if (*exceptionObject == NULL) {
3134 *exceptionObject = PyUnicodeTranslateError_Create(
3135 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 }
3137 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3139 goto onError;
3140 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3141 goto onError;
3142 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3143 goto onError;
3144 return;
3145 onError:
3146 Py_DECREF(*exceptionObject);
3147 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148 }
3149}
3150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151/* raises a UnicodeTranslateError */
3152static void raise_translate_exception(PyObject **exceptionObject,
3153 const Py_UNICODE *unicode, int size,
3154 int startpos, int endpos,
3155 const char *reason)
3156{
3157 make_translate_exception(exceptionObject,
3158 unicode, size, startpos, endpos, reason);
3159 if (*exceptionObject != NULL)
3160 PyCodec_StrictErrors(*exceptionObject);
3161}
3162
3163/* error handling callback helper:
3164 build arguments, call the callback and check the arguments,
3165 put the result into newpos and return the replacement string, which
3166 has to be freed by the caller */
3167static PyObject *unicode_translate_call_errorhandler(const char *errors,
3168 PyObject **errorHandler,
3169 const char *reason,
3170 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3171 int startpos, int endpos,
3172 int *newpos)
3173{
3174 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3175
3176 PyObject *restuple;
3177 PyObject *resunicode;
3178
3179 if (*errorHandler == NULL) {
3180 *errorHandler = PyCodec_LookupError(errors);
3181 if (*errorHandler == NULL)
3182 return NULL;
3183 }
3184
3185 make_translate_exception(exceptionObject,
3186 unicode, size, startpos, endpos, reason);
3187 if (*exceptionObject == NULL)
3188 return NULL;
3189
3190 restuple = PyObject_CallFunctionObjArgs(
3191 *errorHandler, *exceptionObject, NULL);
3192 if (restuple == NULL)
3193 return NULL;
3194 if (!PyTuple_Check(restuple)) {
3195 PyErr_Format(PyExc_TypeError, &argparse[4]);
3196 Py_DECREF(restuple);
3197 return NULL;
3198 }
3199 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3200 &resunicode, newpos)) {
3201 Py_DECREF(restuple);
3202 return NULL;
3203 }
3204 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003205 *newpos = size+*newpos;
3206 if (*newpos<0 || *newpos>size) {
3207 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3208 Py_DECREF(restuple);
3209 return NULL;
3210 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 Py_INCREF(resunicode);
3212 Py_DECREF(restuple);
3213 return resunicode;
3214}
3215
3216/* Lookup the character ch in the mapping and put the result in result,
3217 which must be decrefed by the caller.
3218 Return 0 on success, -1 on error */
3219static
3220int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3221{
3222 PyObject *w = PyInt_FromLong((long)c);
3223 PyObject *x;
3224
3225 if (w == NULL)
3226 return -1;
3227 x = PyObject_GetItem(mapping, w);
3228 Py_DECREF(w);
3229 if (x == NULL) {
3230 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3231 /* No mapping found means: use 1:1 mapping. */
3232 PyErr_Clear();
3233 *result = NULL;
3234 return 0;
3235 } else
3236 return -1;
3237 }
3238 else if (x == Py_None) {
3239 *result = x;
3240 return 0;
3241 }
3242 else if (PyInt_Check(x)) {
3243 long value = PyInt_AS_LONG(x);
3244 long max = PyUnicode_GetMax();
3245 if (value < 0 || value > max) {
3246 PyErr_Format(PyExc_TypeError,
3247 "character mapping must be in range(0x%lx)", max+1);
3248 Py_DECREF(x);
3249 return -1;
3250 }
3251 *result = x;
3252 return 0;
3253 }
3254 else if (PyUnicode_Check(x)) {
3255 *result = x;
3256 return 0;
3257 }
3258 else {
3259 /* wrong return value */
3260 PyErr_SetString(PyExc_TypeError,
3261 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003262 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 return -1;
3264 }
3265}
3266/* ensure that *outobj is at least requiredsize characters long,
3267if not reallocate and adjust various state variables.
3268Return 0 on success, -1 on error */
3269static
Walter Dörwald4894c302003-10-24 14:25:28 +00003270int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 int requiredsize)
3272{
Walter Dörwald4894c302003-10-24 14:25:28 +00003273 int oldsize = PyUnicode_GET_SIZE(*outobj);
3274 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275 /* remember old output position */
3276 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3277 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003278 if (requiredsize < 2 * oldsize)
3279 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003280 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 return -1;
3282 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 }
3284 return 0;
3285}
3286/* lookup the character, put the result in the output string and adjust
3287 various state variables. Return a new reference to the object that
3288 was put in the output buffer in *result, or Py_None, if the mapping was
3289 undefined (in which case no character was written).
3290 The called must decref result.
3291 Return 0 on success, -1 on error. */
3292static
Walter Dörwald4894c302003-10-24 14:25:28 +00003293int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3294 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3295 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296{
Walter Dörwald4894c302003-10-24 14:25:28 +00003297 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 return -1;
3299 if (*res==NULL) {
3300 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003301 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 }
3303 else if (*res==Py_None)
3304 ;
3305 else if (PyInt_Check(*res)) {
3306 /* no overflow check, because we know that the space is enough */
3307 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3308 }
3309 else if (PyUnicode_Check(*res)) {
3310 int repsize = PyUnicode_GET_SIZE(*res);
3311 if (repsize==1) {
3312 /* no overflow check, because we know that the space is enough */
3313 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3314 }
3315 else if (repsize!=0) {
3316 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003317 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003318 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003319 repsize - 1;
3320 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321 return -1;
3322 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3323 *outp += repsize;
3324 }
3325 }
3326 else
3327 return -1;
3328 return 0;
3329}
3330
3331PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 int size,
3333 PyObject *mapping,
3334 const char *errors)
3335{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336 /* output object */
3337 PyObject *res = NULL;
3338 /* pointers to the beginning and end+1 of input */
3339 const Py_UNICODE *startp = p;
3340 const Py_UNICODE *endp = p + size;
3341 /* pointer into the output */
3342 Py_UNICODE *str;
3343 /* current output position */
3344 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345 char *reason = "character maps to <undefined>";
3346 PyObject *errorHandler = NULL;
3347 PyObject *exc = NULL;
3348 /* the following variable is used for caching string comparisons
3349 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3350 * 3=ignore, 4=xmlcharrefreplace */
3351 int known_errorHandler = -1;
3352
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 if (mapping == NULL) {
3354 PyErr_BadArgument();
3355 return NULL;
3356 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003357
3358 /* allocate enough for a simple 1:1 translation without
3359 replacements, if we need more, we'll resize */
3360 res = PyUnicode_FromUnicode(NULL, size);
3361 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003362 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003364 return res;
3365 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003367 while (p<endp) {
3368 /* try to encode it */
3369 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003370 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 goto onError;
3373 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003374 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003375 if (x!=Py_None) /* it worked => adjust input pointer */
3376 ++p;
3377 else { /* untranslatable character */
3378 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3379 int repsize;
3380 int newpos;
3381 Py_UNICODE *uni2;
3382 /* startpos for collecting untranslatable chars */
3383 const Py_UNICODE *collstart = p;
3384 const Py_UNICODE *collend = p+1;
3385 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003387 /* find all untranslatable characters */
3388 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003389 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 goto onError;
3391 Py_XDECREF(x);
3392 if (x!=Py_None)
3393 break;
3394 ++collend;
3395 }
3396 /* cache callback name lookup
3397 * (if not done yet, i.e. it's the first error) */
3398 if (known_errorHandler==-1) {
3399 if ((errors==NULL) || (!strcmp(errors, "strict")))
3400 known_errorHandler = 1;
3401 else if (!strcmp(errors, "replace"))
3402 known_errorHandler = 2;
3403 else if (!strcmp(errors, "ignore"))
3404 known_errorHandler = 3;
3405 else if (!strcmp(errors, "xmlcharrefreplace"))
3406 known_errorHandler = 4;
3407 else
3408 known_errorHandler = 0;
3409 }
3410 switch (known_errorHandler) {
3411 case 1: /* strict */
3412 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3413 goto onError;
3414 case 2: /* replace */
3415 /* No need to check for space, this is a 1:1 replacement */
3416 for (coll = collstart; coll<collend; ++coll)
3417 *str++ = '?';
3418 /* fall through */
3419 case 3: /* ignore */
3420 p = collend;
3421 break;
3422 case 4: /* xmlcharrefreplace */
3423 /* generate replacement (temporarily (mis)uses p) */
3424 for (p = collstart; p < collend; ++p) {
3425 char buffer[2+29+1+1];
3426 char *cp;
3427 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003428 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3430 goto onError;
3431 for (cp = buffer; *cp; ++cp)
3432 *str++ = *cp;
3433 }
3434 p = collend;
3435 break;
3436 default:
3437 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3438 reason, startp, size, &exc,
3439 collstart-startp, collend-startp, &newpos);
3440 if (repunicode == NULL)
3441 goto onError;
3442 /* generate replacement */
3443 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003444 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3446 Py_DECREF(repunicode);
3447 goto onError;
3448 }
3449 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3450 *str++ = *uni2;
3451 p = startp + newpos;
3452 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 }
3454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456 /* Resize if we allocated to much */
3457 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003458 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003459 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003460 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461 }
3462 Py_XDECREF(exc);
3463 Py_XDECREF(errorHandler);
3464 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 onError:
3467 Py_XDECREF(res);
3468 Py_XDECREF(exc);
3469 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 return NULL;
3471}
3472
3473PyObject *PyUnicode_Translate(PyObject *str,
3474 PyObject *mapping,
3475 const char *errors)
3476{
3477 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003478
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479 str = PyUnicode_FromObject(str);
3480 if (str == NULL)
3481 goto onError;
3482 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3483 PyUnicode_GET_SIZE(str),
3484 mapping,
3485 errors);
3486 Py_DECREF(str);
3487 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003488
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 onError:
3490 Py_XDECREF(str);
3491 return NULL;
3492}
Tim Petersced69f82003-09-16 20:30:58 +00003493
Guido van Rossum9e896b32000-04-05 20:11:21 +00003494/* --- Decimal Encoder ---------------------------------------------------- */
3495
3496int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3497 int length,
3498 char *output,
3499 const char *errors)
3500{
3501 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 PyObject *errorHandler = NULL;
3503 PyObject *exc = NULL;
3504 const char *encoding = "decimal";
3505 const char *reason = "invalid decimal Unicode string";
3506 /* the following variable is used for caching string comparisons
3507 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3508 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003509
3510 if (output == NULL) {
3511 PyErr_BadArgument();
3512 return -1;
3513 }
3514
3515 p = s;
3516 end = s + length;
3517 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003519 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520 PyObject *repunicode;
3521 int repsize;
3522 int newpos;
3523 Py_UNICODE *uni2;
3524 Py_UNICODE *collstart;
3525 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003526
Guido van Rossum9e896b32000-04-05 20:11:21 +00003527 if (Py_UNICODE_ISSPACE(ch)) {
3528 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003530 continue;
3531 }
3532 decimal = Py_UNICODE_TODECIMAL(ch);
3533 if (decimal >= 0) {
3534 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003536 continue;
3537 }
Guido van Rossumba477042000-04-06 18:18:10 +00003538 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003539 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003541 continue;
3542 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 /* All other characters are considered unencodable */
3544 collstart = p;
3545 collend = p+1;
3546 while (collend < end) {
3547 if ((0 < *collend && *collend < 256) ||
3548 !Py_UNICODE_ISSPACE(*collend) ||
3549 Py_UNICODE_TODECIMAL(*collend))
3550 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003551 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 /* cache callback name lookup
3553 * (if not done yet, i.e. it's the first error) */
3554 if (known_errorHandler==-1) {
3555 if ((errors==NULL) || (!strcmp(errors, "strict")))
3556 known_errorHandler = 1;
3557 else if (!strcmp(errors, "replace"))
3558 known_errorHandler = 2;
3559 else if (!strcmp(errors, "ignore"))
3560 known_errorHandler = 3;
3561 else if (!strcmp(errors, "xmlcharrefreplace"))
3562 known_errorHandler = 4;
3563 else
3564 known_errorHandler = 0;
3565 }
3566 switch (known_errorHandler) {
3567 case 1: /* strict */
3568 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3569 goto onError;
3570 case 2: /* replace */
3571 for (p = collstart; p < collend; ++p)
3572 *output++ = '?';
3573 /* fall through */
3574 case 3: /* ignore */
3575 p = collend;
3576 break;
3577 case 4: /* xmlcharrefreplace */
3578 /* generate replacement (temporarily (mis)uses p) */
3579 for (p = collstart; p < collend; ++p)
3580 output += sprintf(output, "&#%d;", (int)*p);
3581 p = collend;
3582 break;
3583 default:
3584 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3585 encoding, reason, s, length, &exc,
3586 collstart-s, collend-s, &newpos);
3587 if (repunicode == NULL)
3588 goto onError;
3589 /* generate replacement */
3590 repsize = PyUnicode_GET_SIZE(repunicode);
3591 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3592 Py_UNICODE ch = *uni2;
3593 if (Py_UNICODE_ISSPACE(ch))
3594 *output++ = ' ';
3595 else {
3596 decimal = Py_UNICODE_TODECIMAL(ch);
3597 if (decimal >= 0)
3598 *output++ = '0' + decimal;
3599 else if (0 < ch && ch < 256)
3600 *output++ = (char)ch;
3601 else {
3602 Py_DECREF(repunicode);
3603 raise_encode_exception(&exc, encoding,
3604 s, length, collstart-s, collend-s, reason);
3605 goto onError;
3606 }
3607 }
3608 }
3609 p = s + newpos;
3610 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003611 }
3612 }
3613 /* 0-terminate the output string */
3614 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 Py_XDECREF(exc);
3616 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003617 return 0;
3618
3619 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 Py_XDECREF(exc);
3621 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003622 return -1;
3623}
3624
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625/* --- Helpers ------------------------------------------------------------ */
3626
Tim Petersced69f82003-09-16 20:30:58 +00003627static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628int count(PyUnicodeObject *self,
3629 int start,
3630 int end,
3631 PyUnicodeObject *substring)
3632{
3633 int count = 0;
3634
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003635 if (start < 0)
3636 start += self->length;
3637 if (start < 0)
3638 start = 0;
3639 if (end > self->length)
3640 end = self->length;
3641 if (end < 0)
3642 end += self->length;
3643 if (end < 0)
3644 end = 0;
3645
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003646 if (substring->length == 0)
3647 return (end - start + 1);
3648
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 end -= substring->length;
3650
3651 while (start <= end)
3652 if (Py_UNICODE_MATCH(self, start, substring)) {
3653 count++;
3654 start += substring->length;
3655 } else
3656 start++;
3657
3658 return count;
3659}
3660
3661int PyUnicode_Count(PyObject *str,
3662 PyObject *substr,
3663 int start,
3664 int end)
3665{
3666 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003667
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 str = PyUnicode_FromObject(str);
3669 if (str == NULL)
3670 return -1;
3671 substr = PyUnicode_FromObject(substr);
3672 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003673 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 return -1;
3675 }
Tim Petersced69f82003-09-16 20:30:58 +00003676
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 result = count((PyUnicodeObject *)str,
3678 start, end,
3679 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003680
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 Py_DECREF(str);
3682 Py_DECREF(substr);
3683 return result;
3684}
3685
Tim Petersced69f82003-09-16 20:30:58 +00003686static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687int findstring(PyUnicodeObject *self,
3688 PyUnicodeObject *substring,
3689 int start,
3690 int end,
3691 int direction)
3692{
3693 if (start < 0)
3694 start += self->length;
3695 if (start < 0)
3696 start = 0;
3697
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 if (end > self->length)
3699 end = self->length;
3700 if (end < 0)
3701 end += self->length;
3702 if (end < 0)
3703 end = 0;
3704
Guido van Rossum76afbd92002-08-20 17:29:29 +00003705 if (substring->length == 0)
3706 return (direction > 0) ? start : end;
3707
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 end -= substring->length;
3709
3710 if (direction < 0) {
3711 for (; end >= start; end--)
3712 if (Py_UNICODE_MATCH(self, end, substring))
3713 return end;
3714 } else {
3715 for (; start <= end; start++)
3716 if (Py_UNICODE_MATCH(self, start, substring))
3717 return start;
3718 }
3719
3720 return -1;
3721}
3722
3723int PyUnicode_Find(PyObject *str,
3724 PyObject *substr,
3725 int start,
3726 int end,
3727 int direction)
3728{
3729 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003730
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 str = PyUnicode_FromObject(str);
3732 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003733 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734 substr = PyUnicode_FromObject(substr);
3735 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003736 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003737 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 }
Tim Petersced69f82003-09-16 20:30:58 +00003739
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 result = findstring((PyUnicodeObject *)str,
3741 (PyUnicodeObject *)substr,
3742 start, end, direction);
3743 Py_DECREF(str);
3744 Py_DECREF(substr);
3745 return result;
3746}
3747
Tim Petersced69f82003-09-16 20:30:58 +00003748static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749int tailmatch(PyUnicodeObject *self,
3750 PyUnicodeObject *substring,
3751 int start,
3752 int end,
3753 int direction)
3754{
3755 if (start < 0)
3756 start += self->length;
3757 if (start < 0)
3758 start = 0;
3759
3760 if (substring->length == 0)
3761 return 1;
3762
3763 if (end > self->length)
3764 end = self->length;
3765 if (end < 0)
3766 end += self->length;
3767 if (end < 0)
3768 end = 0;
3769
3770 end -= substring->length;
3771 if (end < start)
3772 return 0;
3773
3774 if (direction > 0) {
3775 if (Py_UNICODE_MATCH(self, end, substring))
3776 return 1;
3777 } else {
3778 if (Py_UNICODE_MATCH(self, start, substring))
3779 return 1;
3780 }
3781
3782 return 0;
3783}
3784
3785int PyUnicode_Tailmatch(PyObject *str,
3786 PyObject *substr,
3787 int start,
3788 int end,
3789 int direction)
3790{
3791 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003792
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 str = PyUnicode_FromObject(str);
3794 if (str == NULL)
3795 return -1;
3796 substr = PyUnicode_FromObject(substr);
3797 if (substr == NULL) {
3798 Py_DECREF(substr);
3799 return -1;
3800 }
Tim Petersced69f82003-09-16 20:30:58 +00003801
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 result = tailmatch((PyUnicodeObject *)str,
3803 (PyUnicodeObject *)substr,
3804 start, end, direction);
3805 Py_DECREF(str);
3806 Py_DECREF(substr);
3807 return result;
3808}
3809
Tim Petersced69f82003-09-16 20:30:58 +00003810static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811const Py_UNICODE *findchar(const Py_UNICODE *s,
3812 int size,
3813 Py_UNICODE ch)
3814{
3815 /* like wcschr, but doesn't stop at NULL characters */
3816
3817 while (size-- > 0) {
3818 if (*s == ch)
3819 return s;
3820 s++;
3821 }
3822
3823 return NULL;
3824}
3825
3826/* Apply fixfct filter to the Unicode object self and return a
3827 reference to the modified object */
3828
Tim Petersced69f82003-09-16 20:30:58 +00003829static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830PyObject *fixup(PyUnicodeObject *self,
3831 int (*fixfct)(PyUnicodeObject *s))
3832{
3833
3834 PyUnicodeObject *u;
3835
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003836 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 if (u == NULL)
3838 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003839
3840 Py_UNICODE_COPY(u->str, self->str, self->length);
3841
Tim Peters7a29bd52001-09-12 03:03:31 +00003842 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 /* fixfct should return TRUE if it modified the buffer. If
3844 FALSE, return a reference to the original buffer instead
3845 (to save space, not time) */
3846 Py_INCREF(self);
3847 Py_DECREF(u);
3848 return (PyObject*) self;
3849 }
3850 return (PyObject*) u;
3851}
3852
Tim Petersced69f82003-09-16 20:30:58 +00003853static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854int fixupper(PyUnicodeObject *self)
3855{
3856 int len = self->length;
3857 Py_UNICODE *s = self->str;
3858 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003859
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860 while (len-- > 0) {
3861 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003862
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 ch = Py_UNICODE_TOUPPER(*s);
3864 if (ch != *s) {
3865 status = 1;
3866 *s = ch;
3867 }
3868 s++;
3869 }
3870
3871 return status;
3872}
3873
Tim Petersced69f82003-09-16 20:30:58 +00003874static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875int fixlower(PyUnicodeObject *self)
3876{
3877 int len = self->length;
3878 Py_UNICODE *s = self->str;
3879 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003880
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881 while (len-- > 0) {
3882 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003883
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 ch = Py_UNICODE_TOLOWER(*s);
3885 if (ch != *s) {
3886 status = 1;
3887 *s = ch;
3888 }
3889 s++;
3890 }
3891
3892 return status;
3893}
3894
Tim Petersced69f82003-09-16 20:30:58 +00003895static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896int fixswapcase(PyUnicodeObject *self)
3897{
3898 int len = self->length;
3899 Py_UNICODE *s = self->str;
3900 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003901
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 while (len-- > 0) {
3903 if (Py_UNICODE_ISUPPER(*s)) {
3904 *s = Py_UNICODE_TOLOWER(*s);
3905 status = 1;
3906 } else if (Py_UNICODE_ISLOWER(*s)) {
3907 *s = Py_UNICODE_TOUPPER(*s);
3908 status = 1;
3909 }
3910 s++;
3911 }
3912
3913 return status;
3914}
3915
Tim Petersced69f82003-09-16 20:30:58 +00003916static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917int fixcapitalize(PyUnicodeObject *self)
3918{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003919 int len = self->length;
3920 Py_UNICODE *s = self->str;
3921 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003922
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003923 if (len == 0)
3924 return 0;
3925 if (Py_UNICODE_ISLOWER(*s)) {
3926 *s = Py_UNICODE_TOUPPER(*s);
3927 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003929 s++;
3930 while (--len > 0) {
3931 if (Py_UNICODE_ISUPPER(*s)) {
3932 *s = Py_UNICODE_TOLOWER(*s);
3933 status = 1;
3934 }
3935 s++;
3936 }
3937 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938}
3939
3940static
3941int fixtitle(PyUnicodeObject *self)
3942{
3943 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3944 register Py_UNICODE *e;
3945 int previous_is_cased;
3946
3947 /* Shortcut for single character strings */
3948 if (PyUnicode_GET_SIZE(self) == 1) {
3949 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3950 if (*p != ch) {
3951 *p = ch;
3952 return 1;
3953 }
3954 else
3955 return 0;
3956 }
Tim Petersced69f82003-09-16 20:30:58 +00003957
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 e = p + PyUnicode_GET_SIZE(self);
3959 previous_is_cased = 0;
3960 for (; p < e; p++) {
3961 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003962
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 if (previous_is_cased)
3964 *p = Py_UNICODE_TOLOWER(ch);
3965 else
3966 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003967
3968 if (Py_UNICODE_ISLOWER(ch) ||
3969 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 Py_UNICODE_ISTITLE(ch))
3971 previous_is_cased = 1;
3972 else
3973 previous_is_cased = 0;
3974 }
3975 return 1;
3976}
3977
Tim Peters8ce9f162004-08-27 01:49:32 +00003978PyObject *
3979PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980{
Tim Peters8ce9f162004-08-27 01:49:32 +00003981 PyObject *internal_separator = NULL;
Tim Peters05eba1f2004-08-27 21:32:02 +00003982 const Py_UNICODE *sep;
Tim Peters8ce9f162004-08-27 01:49:32 +00003983 size_t seplen;
Tim Peters05eba1f2004-08-27 21:32:02 +00003984 PyUnicodeObject *res = NULL; /* the result */
3985 size_t res_alloc = 100; /* # allocated bytes for string in res */
3986 size_t res_used; /* # used bytes */
3987 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
3988 PyObject *fseq; /* PySequence_Fast(seq) */
3989 int seqlen; /* len(fseq) -- number of items in sequence */
3990 const Py_UNICODE blank = ' ';
Tim Peters8ce9f162004-08-27 01:49:32 +00003991 PyObject *item;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 int i;
3993
Tim Peters05eba1f2004-08-27 21:32:02 +00003994 fseq = PySequence_Fast(seq, "");
3995 if (fseq == NULL) {
3996 if (PyErr_ExceptionMatches(PyExc_TypeError))
3997 PyErr_Format(PyExc_TypeError,
3998 "sequence expected, %.80s found",
3999 seq->ob_type->tp_name);
4000 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004001 }
4002
Tim Peters05eba1f2004-08-27 21:32:02 +00004003 seqlen = PySequence_Fast_GET_SIZE(fseq);
4004 /* If empty sequence, return u"". */
4005 if (seqlen == 0) {
4006 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4007 goto Done;
4008 }
4009 /* If singleton sequence with an exact Unicode, return that. */
4010 if (seqlen == 1) {
4011 item = PySequence_Fast_GET_ITEM(fseq, 0);
4012 if (PyUnicode_CheckExact(item)) {
4013 Py_INCREF(item);
4014 res = (PyUnicodeObject *)item;
4015 goto Done;
4016 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004017 }
4018
Tim Peters05eba1f2004-08-27 21:32:02 +00004019 /* At least two items to join, or one that isn't exact Unicode. */
4020 if (seqlen > 1) {
4021 /* Set up sep and seplen -- they're needed. */
4022 if (separator == NULL) {
4023 sep = &blank;
4024 seplen = 1;
4025 }
4026 else {
4027 internal_separator = PyUnicode_FromObject(separator);
4028 if (internal_separator == NULL)
4029 goto onError;
4030 sep = PyUnicode_AS_UNICODE(internal_separator);
4031 seplen = PyUnicode_GET_SIZE(internal_separator);
4032 }
4033 }
4034
4035 /* Get space. */
4036 res = _PyUnicode_New((int)res_alloc);
4037 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004038 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004039 res_p = PyUnicode_AS_UNICODE(res);
4040 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004041
Tim Peters05eba1f2004-08-27 21:32:02 +00004042 for (i = 0; i < seqlen; ++i) {
4043 size_t itemlen;
4044 size_t new_res_used;
4045
4046 item = PySequence_Fast_GET_ITEM(fseq, i);
4047 /* Convert item to Unicode. */
4048 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4049 PyErr_Format(PyExc_TypeError,
4050 "sequence item %i: expected string or Unicode,"
4051 " %.80s found",
4052 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004053 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004054 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004055 item = PyUnicode_FromObject(item);
4056 if (item == NULL)
4057 goto onError;
4058 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004059
Tim Peters8ce9f162004-08-27 01:49:32 +00004060 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004062 new_res_used = res_used + itemlen;
4063 if (new_res_used < res_used || new_res_used > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004064 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004065 if (i < seqlen - 1) {
4066 new_res_used += seplen;
4067 if (new_res_used < res_used || new_res_used > INT_MAX)
4068 goto Overflow;
4069 }
4070 if (new_res_used > res_alloc) {
4071 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004072 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004073 size_t oldsize = res_alloc;
4074 res_alloc += res_alloc;
4075 if (res_alloc < oldsize || res_alloc > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004076 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004077 } while (new_res_used > res_alloc);
4078 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004079 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004081 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004082 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004084
4085 /* Copy item, and maybe the separator. */
4086 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4087 res_p += itemlen;
4088 if (i < seqlen - 1) {
4089 Py_UNICODE_COPY(res_p, sep, (int)seplen);
4090 res_p += seplen;
4091 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004093 res_used = new_res_used;
4094 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004095
Tim Peters05eba1f2004-08-27 21:32:02 +00004096 /* Shrink res to match the used area; this probably can't fail,
4097 * but it's cheap to check.
4098 */
4099 if (_PyUnicode_Resize(&res, (int)res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004100 goto onError;
4101
4102 Done:
4103 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004104 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105 return (PyObject *)res;
4106
Tim Peters8ce9f162004-08-27 01:49:32 +00004107 Overflow:
4108 PyErr_SetString(PyExc_OverflowError,
4109 "join() is too long for a Python string");
4110 Py_DECREF(item);
4111 /* fall through */
4112
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004114 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004115 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004116 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 return NULL;
4118}
4119
Tim Petersced69f82003-09-16 20:30:58 +00004120static
4121PyUnicodeObject *pad(PyUnicodeObject *self,
4122 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 int right,
4124 Py_UNICODE fill)
4125{
4126 PyUnicodeObject *u;
4127
4128 if (left < 0)
4129 left = 0;
4130 if (right < 0)
4131 right = 0;
4132
Tim Peters7a29bd52001-09-12 03:03:31 +00004133 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 Py_INCREF(self);
4135 return self;
4136 }
4137
4138 u = _PyUnicode_New(left + self->length + right);
4139 if (u) {
4140 if (left)
4141 Py_UNICODE_FILL(u->str, fill, left);
4142 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4143 if (right)
4144 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4145 }
4146
4147 return u;
4148}
4149
/* Append the slice data[left:right] to `list` as a new Unicode object.
   Relies on `str` and `list` being declared in the enclosing function
   and jumps to that function's `onError` label on failure.  The body is
   wrapped in do { } while (0) so the expansion is a single statement;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4160
/* Insert the slice data[left:right] at the front of `list` as a new
   Unicode object.  Relies on `str` and `list` being declared in the
   enclosing function and jumps to that function's `onError` label on
   failure.  The body is wrapped in do { } while (0) so the expansion is
   a single statement; invoke it with a trailing semicolon. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4171
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172static
4173PyObject *split_whitespace(PyUnicodeObject *self,
4174 PyObject *list,
4175 int maxcount)
4176{
4177 register int i;
4178 register int j;
4179 int len = self->length;
4180 PyObject *str;
4181
4182 for (i = j = 0; i < len; ) {
4183 /* find a token */
4184 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4185 i++;
4186 j = i;
4187 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4188 i++;
4189 if (j < i) {
4190 if (maxcount-- <= 0)
4191 break;
4192 SPLIT_APPEND(self->str, j, i);
4193 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4194 i++;
4195 j = i;
4196 }
4197 }
4198 if (j < len) {
4199 SPLIT_APPEND(self->str, j, len);
4200 }
4201 return list;
4202
4203 onError:
4204 Py_DECREF(list);
4205 return NULL;
4206}
4207
4208PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004209 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210{
4211 register int i;
4212 register int j;
4213 int len;
4214 PyObject *list;
4215 PyObject *str;
4216 Py_UNICODE *data;
4217
4218 string = PyUnicode_FromObject(string);
4219 if (string == NULL)
4220 return NULL;
4221 data = PyUnicode_AS_UNICODE(string);
4222 len = PyUnicode_GET_SIZE(string);
4223
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 list = PyList_New(0);
4225 if (!list)
4226 goto onError;
4227
4228 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004229 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004230
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231 /* Find a line and append it */
4232 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4233 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234
4235 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004236 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237 if (i < len) {
4238 if (data[i] == '\r' && i + 1 < len &&
4239 data[i+1] == '\n')
4240 i += 2;
4241 else
4242 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004243 if (keepends)
4244 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004245 }
Guido van Rossum86662912000-04-11 15:38:46 +00004246 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247 j = i;
4248 }
4249 if (j < len) {
4250 SPLIT_APPEND(data, j, len);
4251 }
4252
4253 Py_DECREF(string);
4254 return list;
4255
4256 onError:
4257 Py_DECREF(list);
4258 Py_DECREF(string);
4259 return NULL;
4260}
4261
Tim Petersced69f82003-09-16 20:30:58 +00004262static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263PyObject *split_char(PyUnicodeObject *self,
4264 PyObject *list,
4265 Py_UNICODE ch,
4266 int maxcount)
4267{
4268 register int i;
4269 register int j;
4270 int len = self->length;
4271 PyObject *str;
4272
4273 for (i = j = 0; i < len; ) {
4274 if (self->str[i] == ch) {
4275 if (maxcount-- <= 0)
4276 break;
4277 SPLIT_APPEND(self->str, j, i);
4278 i = j = i + 1;
4279 } else
4280 i++;
4281 }
4282 if (j <= len) {
4283 SPLIT_APPEND(self->str, j, len);
4284 }
4285 return list;
4286
4287 onError:
4288 Py_DECREF(list);
4289 return NULL;
4290}
4291
Tim Petersced69f82003-09-16 20:30:58 +00004292static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293PyObject *split_substring(PyUnicodeObject *self,
4294 PyObject *list,
4295 PyUnicodeObject *substring,
4296 int maxcount)
4297{
4298 register int i;
4299 register int j;
4300 int len = self->length;
4301 int sublen = substring->length;
4302 PyObject *str;
4303
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004304 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 if (Py_UNICODE_MATCH(self, i, substring)) {
4306 if (maxcount-- <= 0)
4307 break;
4308 SPLIT_APPEND(self->str, j, i);
4309 i = j = i + sublen;
4310 } else
4311 i++;
4312 }
4313 if (j <= len) {
4314 SPLIT_APPEND(self->str, j, len);
4315 }
4316 return list;
4317
4318 onError:
4319 Py_DECREF(list);
4320 return NULL;
4321}
4322
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004323static
4324PyObject *rsplit_whitespace(PyUnicodeObject *self,
4325 PyObject *list,
4326 int maxcount)
4327{
4328 register int i;
4329 register int j;
4330 int len = self->length;
4331 PyObject *str;
4332
4333 for (i = j = len - 1; i >= 0; ) {
4334 /* find a token */
4335 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4336 i--;
4337 j = i;
4338 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4339 i--;
4340 if (j > i) {
4341 if (maxcount-- <= 0)
4342 break;
4343 SPLIT_INSERT(self->str, i + 1, j + 1);
4344 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4345 i--;
4346 j = i;
4347 }
4348 }
4349 if (j >= 0) {
4350 SPLIT_INSERT(self->str, 0, j + 1);
4351 }
4352 return list;
4353
4354 onError:
4355 Py_DECREF(list);
4356 return NULL;
4357}
4358
4359static
4360PyObject *rsplit_char(PyUnicodeObject *self,
4361 PyObject *list,
4362 Py_UNICODE ch,
4363 int maxcount)
4364{
4365 register int i;
4366 register int j;
4367 int len = self->length;
4368 PyObject *str;
4369
4370 for (i = j = len - 1; i >= 0; ) {
4371 if (self->str[i] == ch) {
4372 if (maxcount-- <= 0)
4373 break;
4374 SPLIT_INSERT(self->str, i + 1, j + 1);
4375 j = i = i - 1;
4376 } else
4377 i--;
4378 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004379 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004380 SPLIT_INSERT(self->str, 0, j + 1);
4381 }
4382 return list;
4383
4384 onError:
4385 Py_DECREF(list);
4386 return NULL;
4387}
4388
4389static
4390PyObject *rsplit_substring(PyUnicodeObject *self,
4391 PyObject *list,
4392 PyUnicodeObject *substring,
4393 int maxcount)
4394{
4395 register int i;
4396 register int j;
4397 int len = self->length;
4398 int sublen = substring->length;
4399 PyObject *str;
4400
4401 for (i = len - sublen, j = len; i >= 0; ) {
4402 if (Py_UNICODE_MATCH(self, i, substring)) {
4403 if (maxcount-- <= 0)
4404 break;
4405 SPLIT_INSERT(self->str, i + sublen, j);
4406 j = i;
4407 i -= sublen;
4408 } else
4409 i--;
4410 }
4411 if (j >= 0) {
4412 SPLIT_INSERT(self->str, 0, j);
4413 }
4414 return list;
4415
4416 onError:
4417 Py_DECREF(list);
4418 return NULL;
4419}
4420
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004422#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423
4424static
4425PyObject *split(PyUnicodeObject *self,
4426 PyUnicodeObject *substring,
4427 int maxcount)
4428{
4429 PyObject *list;
4430
4431 if (maxcount < 0)
4432 maxcount = INT_MAX;
4433
4434 list = PyList_New(0);
4435 if (!list)
4436 return NULL;
4437
4438 if (substring == NULL)
4439 return split_whitespace(self,list,maxcount);
4440
4441 else if (substring->length == 1)
4442 return split_char(self,list,substring->str[0],maxcount);
4443
4444 else if (substring->length == 0) {
4445 Py_DECREF(list);
4446 PyErr_SetString(PyExc_ValueError, "empty separator");
4447 return NULL;
4448 }
4449 else
4450 return split_substring(self,list,substring,maxcount);
4451}
4452
Tim Petersced69f82003-09-16 20:30:58 +00004453static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004454PyObject *rsplit(PyUnicodeObject *self,
4455 PyUnicodeObject *substring,
4456 int maxcount)
4457{
4458 PyObject *list;
4459
4460 if (maxcount < 0)
4461 maxcount = INT_MAX;
4462
4463 list = PyList_New(0);
4464 if (!list)
4465 return NULL;
4466
4467 if (substring == NULL)
4468 return rsplit_whitespace(self,list,maxcount);
4469
4470 else if (substring->length == 1)
4471 return rsplit_char(self,list,substring->str[0],maxcount);
4472
4473 else if (substring->length == 0) {
4474 Py_DECREF(list);
4475 PyErr_SetString(PyExc_ValueError, "empty separator");
4476 return NULL;
4477 }
4478 else
4479 return rsplit_substring(self,list,substring,maxcount);
4480}
4481
4482static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483PyObject *replace(PyUnicodeObject *self,
4484 PyUnicodeObject *str1,
4485 PyUnicodeObject *str2,
4486 int maxcount)
4487{
4488 PyUnicodeObject *u;
4489
4490 if (maxcount < 0)
4491 maxcount = INT_MAX;
4492
4493 if (str1->length == 1 && str2->length == 1) {
4494 int i;
4495
4496 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004497 if (!findchar(self->str, self->length, str1->str[0]) &&
4498 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499 /* nothing to replace, return original string */
4500 Py_INCREF(self);
4501 u = self;
4502 } else {
4503 Py_UNICODE u1 = str1->str[0];
4504 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004505
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004507 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508 self->length
4509 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004510 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004511 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004512 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513 for (i = 0; i < u->length; i++)
4514 if (u->str[i] == u1) {
4515 if (--maxcount < 0)
4516 break;
4517 u->str[i] = u2;
4518 }
4519 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521
4522 } else {
4523 int n, i;
4524 Py_UNICODE *p;
4525
4526 /* replace strings */
4527 n = count(self, 0, self->length, str1);
4528 if (n > maxcount)
4529 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004530 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004532 if (PyUnicode_CheckExact(self)) {
4533 Py_INCREF(self);
4534 u = self;
4535 }
4536 else {
4537 u = (PyUnicodeObject *)
4538 PyUnicode_FromUnicode(self->str, self->length);
4539 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 } else {
4541 u = _PyUnicode_New(
4542 self->length + n * (str2->length - str1->length));
4543 if (u) {
4544 i = 0;
4545 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004546 if (str1->length > 0) {
4547 while (i <= self->length - str1->length)
4548 if (Py_UNICODE_MATCH(self, i, str1)) {
4549 /* replace string segment */
4550 Py_UNICODE_COPY(p, str2->str, str2->length);
4551 p += str2->length;
4552 i += str1->length;
4553 if (--n <= 0) {
4554 /* copy remaining part */
4555 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4556 break;
4557 }
4558 } else
4559 *p++ = self->str[i++];
4560 } else {
4561 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562 Py_UNICODE_COPY(p, str2->str, str2->length);
4563 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004564 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004567 }
4568 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 }
4571 }
4572 }
Tim Petersced69f82003-09-16 20:30:58 +00004573
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 return (PyObject *) u;
4575}
4576
4577/* --- Unicode Object Methods --------------------------------------------- */
4578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004579PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004580"S.title() -> unicode\n\
4581\n\
4582Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004583characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584
4585static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004586unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588 return fixup(self, fixtitle);
4589}
4590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004591PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592"S.capitalize() -> unicode\n\
4593\n\
4594Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004595have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596
4597static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004598unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600 return fixup(self, fixcapitalize);
4601}
4602
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out): split self on whitespace, capitalize each
   word with fixup()/fixcapitalize, and join the words again with
   PyUnicode_Join(NULL, ...).  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                fixcapitalize);
        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4640
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004641/* Argument converter. Coerces to a single unicode character */
4642
4643static int
4644convert_uc(PyObject *obj, void *addr)
4645{
4646 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4647 PyObject *uniobj;
4648 Py_UNICODE *unistr;
4649
4650 uniobj = PyUnicode_FromObject(obj);
4651 if (uniobj == NULL) {
4652 PyErr_SetString(PyExc_TypeError,
4653 "The fill character cannot be converted to Unicode");
4654 return 0;
4655 }
4656 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4657 PyErr_SetString(PyExc_TypeError,
4658 "The fill character must be exactly one character long");
4659 Py_DECREF(uniobj);
4660 return 0;
4661 }
4662 unistr = PyUnicode_AS_UNICODE(uniobj);
4663 *fillcharloc = unistr[0];
4664 Py_DECREF(uniobj);
4665 return 1;
4666}
4667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004668PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004669"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004671Return S centered in a Unicode string of length width. Padding is\n\
4672done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673
4674static PyObject *
4675unicode_center(PyUnicodeObject *self, PyObject *args)
4676{
4677 int marg, left;
4678 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004679 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004681 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 return NULL;
4683
Tim Peters7a29bd52001-09-12 03:03:31 +00004684 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 Py_INCREF(self);
4686 return (PyObject*) self;
4687 }
4688
4689 marg = width - self->length;
4690 left = marg / 2 + (marg & width & 1);
4691
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004692 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693}
4694
Marc-André Lemburge5034372000-08-08 08:04:29 +00004695#if 0
4696
4697/* This code should go into some future Unicode collation support
4698 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004699 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004700
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004701/* speedy UTF-16 code point order comparison */
4702/* gleaned from: */
4703/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4704
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004705static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004706{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004707 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004708 0, 0, 0, 0, 0, 0, 0, 0,
4709 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004710 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004711};
4712
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713static int
4714unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4715{
4716 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004717
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718 Py_UNICODE *s1 = str1->str;
4719 Py_UNICODE *s2 = str2->str;
4720
4721 len1 = str1->length;
4722 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004723
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004725 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004726
4727 c1 = *s1++;
4728 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004729
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004730 if (c1 > (1<<11) * 26)
4731 c1 += utf16Fixup[c1>>11];
4732 if (c2 > (1<<11) * 26)
4733 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004734 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004735
4736 if (c1 != c2)
4737 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004738
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004739 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 }
4741
4742 return (len1 < len2) ? -1 : (len1 != len2);
4743}
4744
Marc-André Lemburge5034372000-08-08 08:04:29 +00004745#else
4746
4747static int
4748unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4749{
4750 register int len1, len2;
4751
4752 Py_UNICODE *s1 = str1->str;
4753 Py_UNICODE *s2 = str2->str;
4754
4755 len1 = str1->length;
4756 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004757
Marc-André Lemburge5034372000-08-08 08:04:29 +00004758 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004759 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004760
Fredrik Lundh45714e92001-06-26 16:39:36 +00004761 c1 = *s1++;
4762 c2 = *s2++;
4763
4764 if (c1 != c2)
4765 return (c1 < c2) ? -1 : 1;
4766
Marc-André Lemburge5034372000-08-08 08:04:29 +00004767 len1--; len2--;
4768 }
4769
4770 return (len1 < len2) ? -1 : (len1 != len2);
4771}
4772
4773#endif
4774
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775int PyUnicode_Compare(PyObject *left,
4776 PyObject *right)
4777{
4778 PyUnicodeObject *u = NULL, *v = NULL;
4779 int result;
4780
4781 /* Coerce the two arguments */
4782 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4783 if (u == NULL)
4784 goto onError;
4785 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4786 if (v == NULL)
4787 goto onError;
4788
Thomas Wouters7e474022000-07-16 12:04:32 +00004789 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 if (v == u) {
4791 Py_DECREF(u);
4792 Py_DECREF(v);
4793 return 0;
4794 }
4795
4796 result = unicode_compare(u, v);
4797
4798 Py_DECREF(u);
4799 Py_DECREF(v);
4800 return result;
4801
4802onError:
4803 Py_XDECREF(u);
4804 Py_XDECREF(v);
4805 return -1;
4806}
4807
Guido van Rossum403d68b2000-03-13 15:55:09 +00004808int PyUnicode_Contains(PyObject *container,
4809 PyObject *element)
4810{
4811 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004812 int result, size;
4813 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004814
4815 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004816 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004817 if (v == NULL) {
4818 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004819 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004820 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004821 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004822 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004823 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004824 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004825
Barry Warsaw817918c2002-08-06 16:58:21 +00004826 size = PyUnicode_GET_SIZE(v);
4827 rhs = PyUnicode_AS_UNICODE(v);
4828 lhs = PyUnicode_AS_UNICODE(u);
4829
Guido van Rossum403d68b2000-03-13 15:55:09 +00004830 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004831 if (size == 1) {
4832 end = lhs + PyUnicode_GET_SIZE(u);
4833 while (lhs < end) {
4834 if (*lhs++ == *rhs) {
4835 result = 1;
4836 break;
4837 }
4838 }
4839 }
4840 else {
4841 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4842 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004843 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004844 result = 1;
4845 break;
4846 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004847 }
4848 }
4849
4850 Py_DECREF(u);
4851 Py_DECREF(v);
4852 return result;
4853
4854onError:
4855 Py_XDECREF(u);
4856 Py_XDECREF(v);
4857 return -1;
4858}
4859
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860/* Concat to string or Unicode object giving a new Unicode object. */
4861
4862PyObject *PyUnicode_Concat(PyObject *left,
4863 PyObject *right)
4864{
4865 PyUnicodeObject *u = NULL, *v = NULL, *w;
4866
4867 /* Coerce the two arguments */
4868 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4869 if (u == NULL)
4870 goto onError;
4871 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4872 if (v == NULL)
4873 goto onError;
4874
4875 /* Shortcuts */
4876 if (v == unicode_empty) {
4877 Py_DECREF(v);
4878 return (PyObject *)u;
4879 }
4880 if (u == unicode_empty) {
4881 Py_DECREF(u);
4882 return (PyObject *)v;
4883 }
4884
4885 /* Concat the two Unicode strings */
4886 w = _PyUnicode_New(u->length + v->length);
4887 if (w == NULL)
4888 goto onError;
4889 Py_UNICODE_COPY(w->str, u->str, u->length);
4890 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4891
4892 Py_DECREF(u);
4893 Py_DECREF(v);
4894 return (PyObject *)w;
4895
4896onError:
4897 Py_XDECREF(u);
4898 Py_XDECREF(v);
4899 return NULL;
4900}
4901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004902PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903"S.count(sub[, start[, end]]) -> int\n\
4904\n\
4905Return the number of occurrences of substring sub in Unicode string\n\
4906S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004907interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908
4909static PyObject *
4910unicode_count(PyUnicodeObject *self, PyObject *args)
4911{
4912 PyUnicodeObject *substring;
4913 int start = 0;
4914 int end = INT_MAX;
4915 PyObject *result;
4916
Guido van Rossumb8872e62000-05-09 14:14:27 +00004917 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4918 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 return NULL;
4920
4921 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4922 (PyObject *)substring);
4923 if (substring == NULL)
4924 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004925
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926 if (start < 0)
4927 start += self->length;
4928 if (start < 0)
4929 start = 0;
4930 if (end > self->length)
4931 end = self->length;
4932 if (end < 0)
4933 end += self->length;
4934 if (end < 0)
4935 end = 0;
4936
4937 result = PyInt_FromLong((long) count(self, start, end, substring));
4938
4939 Py_DECREF(substring);
4940 return result;
4941}
4942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004943PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004944"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004946Encodes S using the codec registered for encoding. encoding defaults\n\
4947to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004948handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4950'xmlcharrefreplace' as well as any other name registered with\n\
4951codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952
4953static PyObject *
4954unicode_encode(PyUnicodeObject *self, PyObject *args)
4955{
4956 char *encoding = NULL;
4957 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004958 PyObject *v;
4959
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4961 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004962 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004963 if (v == NULL)
4964 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004965 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
4966 PyErr_Format(PyExc_TypeError,
4967 "encoder did not return a string/unicode object "
4968 "(type=%.400s)",
4969 v->ob_type->tp_name);
4970 Py_DECREF(v);
4971 return NULL;
4972 }
4973 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004974
4975 onError:
4976 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004977}
4978
4979PyDoc_STRVAR(decode__doc__,
4980"S.decode([encoding[,errors]]) -> string or unicode\n\
4981\n\
4982Decodes S using the codec registered for encoding. encoding defaults\n\
4983to the default encoding. errors may be given to set a different error\n\
4984handling scheme. Default is 'strict' meaning that encoding errors raise\n\
4985a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
4986as well as any other name registerd with codecs.register_error that is\n\
4987able to handle UnicodeDecodeErrors.");
4988
4989static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00004990unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004991{
4992 char *encoding = NULL;
4993 char *errors = NULL;
4994 PyObject *v;
4995
4996 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
4997 return NULL;
4998 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004999 if (v == NULL)
5000 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005001 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5002 PyErr_Format(PyExc_TypeError,
5003 "decoder did not return a string/unicode object "
5004 "(type=%.400s)",
5005 v->ob_type->tp_name);
5006 Py_DECREF(v);
5007 return NULL;
5008 }
5009 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005010
5011 onError:
5012 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013}
5014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005015PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016"S.expandtabs([tabsize]) -> unicode\n\
5017\n\
5018Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005019If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020
5021static PyObject*
5022unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5023{
5024 Py_UNICODE *e;
5025 Py_UNICODE *p;
5026 Py_UNICODE *q;
5027 int i, j;
5028 PyUnicodeObject *u;
5029 int tabsize = 8;
5030
5031 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5032 return NULL;
5033
Thomas Wouters7e474022000-07-16 12:04:32 +00005034 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035 i = j = 0;
5036 e = self->str + self->length;
5037 for (p = self->str; p < e; p++)
5038 if (*p == '\t') {
5039 if (tabsize > 0)
5040 j += tabsize - (j % tabsize);
5041 }
5042 else {
5043 j++;
5044 if (*p == '\n' || *p == '\r') {
5045 i += j;
5046 j = 0;
5047 }
5048 }
5049
5050 /* Second pass: create output string and fill it */
5051 u = _PyUnicode_New(i + j);
5052 if (!u)
5053 return NULL;
5054
5055 j = 0;
5056 q = u->str;
5057
5058 for (p = self->str; p < e; p++)
5059 if (*p == '\t') {
5060 if (tabsize > 0) {
5061 i = tabsize - (j % tabsize);
5062 j += i;
5063 while (i--)
5064 *q++ = ' ';
5065 }
5066 }
5067 else {
5068 j++;
5069 *q++ = *p;
5070 if (*p == '\n' || *p == '\r')
5071 j = 0;
5072 }
5073
5074 return (PyObject*) u;
5075}
5076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005077PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078"S.find(sub [,start [,end]]) -> int\n\
5079\n\
5080Return the lowest index in S where substring sub is found,\n\
5081such that sub is contained within s[start,end]. Optional\n\
5082arguments start and end are interpreted as in slice notation.\n\
5083\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005084Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085
5086static PyObject *
5087unicode_find(PyUnicodeObject *self, PyObject *args)
5088{
5089 PyUnicodeObject *substring;
5090 int start = 0;
5091 int end = INT_MAX;
5092 PyObject *result;
5093
Guido van Rossumb8872e62000-05-09 14:14:27 +00005094 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5095 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096 return NULL;
5097 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5098 (PyObject *)substring);
5099 if (substring == NULL)
5100 return NULL;
5101
5102 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5103
5104 Py_DECREF(substring);
5105 return result;
5106}
5107
5108static PyObject *
5109unicode_getitem(PyUnicodeObject *self, int index)
5110{
5111 if (index < 0 || index >= self->length) {
5112 PyErr_SetString(PyExc_IndexError, "string index out of range");
5113 return NULL;
5114 }
5115
5116 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5117}
5118
5119static long
5120unicode_hash(PyUnicodeObject *self)
5121{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005122 /* Since Unicode objects compare equal to their ASCII string
5123 counterparts, they should use the individual character values
5124 as basis for their hash value. This is needed to assure that
5125 strings and Unicode objects behave in the same way as
5126 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127
Fredrik Lundhdde61642000-07-10 18:27:47 +00005128 register int len;
5129 register Py_UNICODE *p;
5130 register long x;
5131
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 if (self->hash != -1)
5133 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005134 len = PyUnicode_GET_SIZE(self);
5135 p = PyUnicode_AS_UNICODE(self);
5136 x = *p << 7;
5137 while (--len >= 0)
5138 x = (1000003*x) ^ *p++;
5139 x ^= PyUnicode_GET_SIZE(self);
5140 if (x == -1)
5141 x = -2;
5142 self->hash = x;
5143 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144}
5145
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005146PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147"S.index(sub [,start [,end]]) -> int\n\
5148\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005149Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150
5151static PyObject *
5152unicode_index(PyUnicodeObject *self, PyObject *args)
5153{
5154 int result;
5155 PyUnicodeObject *substring;
5156 int start = 0;
5157 int end = INT_MAX;
5158
Guido van Rossumb8872e62000-05-09 14:14:27 +00005159 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5160 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005162
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5164 (PyObject *)substring);
5165 if (substring == NULL)
5166 return NULL;
5167
5168 result = findstring(self, substring, start, end, 1);
5169
5170 Py_DECREF(substring);
5171 if (result < 0) {
5172 PyErr_SetString(PyExc_ValueError, "substring not found");
5173 return NULL;
5174 }
5175 return PyInt_FromLong(result);
5176}
5177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005178PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005179"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005181Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005182at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183
5184static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005185unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186{
5187 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5188 register const Py_UNICODE *e;
5189 int cased;
5190
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 /* Shortcut for single character strings */
5192 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005193 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005195 /* Special case for empty strings */
5196 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005197 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005198
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 e = p + PyUnicode_GET_SIZE(self);
5200 cased = 0;
5201 for (; p < e; p++) {
5202 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005203
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005205 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 else if (!cased && Py_UNICODE_ISLOWER(ch))
5207 cased = 1;
5208 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005209 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210}
5211
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005212PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005213"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005215Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005216at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217
5218static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005219unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220{
5221 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5222 register const Py_UNICODE *e;
5223 int cased;
5224
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225 /* Shortcut for single character strings */
5226 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005227 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005229 /* Special case for empty strings */
5230 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005231 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005232
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 e = p + PyUnicode_GET_SIZE(self);
5234 cased = 0;
5235 for (; p < e; p++) {
5236 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005237
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005239 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 else if (!cased && Py_UNICODE_ISUPPER(ch))
5241 cased = 1;
5242 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005243 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244}
5245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005246PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005247"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005249Return True if S is a titlecased string and there is at least one\n\
5250character in S, i.e. upper- and titlecase characters may only\n\
5251follow uncased characters and lowercase characters only cased ones.\n\
5252Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253
5254static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005255unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256{
5257 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5258 register const Py_UNICODE *e;
5259 int cased, previous_is_cased;
5260
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261 /* Shortcut for single character strings */
5262 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005263 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5264 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005266 /* Special case for empty strings */
5267 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005268 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005269
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 e = p + PyUnicode_GET_SIZE(self);
5271 cased = 0;
5272 previous_is_cased = 0;
5273 for (; p < e; p++) {
5274 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005275
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5277 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005278 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279 previous_is_cased = 1;
5280 cased = 1;
5281 }
5282 else if (Py_UNICODE_ISLOWER(ch)) {
5283 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005284 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 previous_is_cased = 1;
5286 cased = 1;
5287 }
5288 else
5289 previous_is_cased = 0;
5290 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005291 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292}
5293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005294PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005295"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005297Return True if all characters in S are whitespace\n\
5298and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299
5300static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005301unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302{
5303 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5304 register const Py_UNICODE *e;
5305
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 /* Shortcut for single character strings */
5307 if (PyUnicode_GET_SIZE(self) == 1 &&
5308 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005309 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005311 /* Special case for empty strings */
5312 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005313 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005314
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 e = p + PyUnicode_GET_SIZE(self);
5316 for (; p < e; p++) {
5317 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005318 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005320 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321}
5322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005323PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005324"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005325\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005326Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005327and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005328
5329static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005330unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005331{
5332 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5333 register const Py_UNICODE *e;
5334
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005335 /* Shortcut for single character strings */
5336 if (PyUnicode_GET_SIZE(self) == 1 &&
5337 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005338 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005339
5340 /* Special case for empty strings */
5341 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005342 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005343
5344 e = p + PyUnicode_GET_SIZE(self);
5345 for (; p < e; p++) {
5346 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005347 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005348 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005349 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005350}
5351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005352PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005353"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005354\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005355Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005356and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005357
5358static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005359unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005360{
5361 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5362 register const Py_UNICODE *e;
5363
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005364 /* Shortcut for single character strings */
5365 if (PyUnicode_GET_SIZE(self) == 1 &&
5366 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005367 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005368
5369 /* Special case for empty strings */
5370 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005371 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005372
5373 e = p + PyUnicode_GET_SIZE(self);
5374 for (; p < e; p++) {
5375 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005376 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005377 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005378 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005379}
5380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005381PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005382"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005384Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005385False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386
5387static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005388unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389{
5390 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5391 register const Py_UNICODE *e;
5392
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393 /* Shortcut for single character strings */
5394 if (PyUnicode_GET_SIZE(self) == 1 &&
5395 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005396 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005398 /* Special case for empty strings */
5399 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005400 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005401
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 e = p + PyUnicode_GET_SIZE(self);
5403 for (; p < e; p++) {
5404 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005405 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005407 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408}
5409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005410PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005411"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005413Return True if all characters in S are digits\n\
5414and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415
5416static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005417unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418{
5419 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5420 register const Py_UNICODE *e;
5421
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 /* Shortcut for single character strings */
5423 if (PyUnicode_GET_SIZE(self) == 1 &&
5424 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005425 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005427 /* Special case for empty strings */
5428 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005429 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005430
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 e = p + PyUnicode_GET_SIZE(self);
5432 for (; p < e; p++) {
5433 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005434 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005436 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437}
5438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005439PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005440"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005442Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005443False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444
5445static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005446unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447{
5448 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5449 register const Py_UNICODE *e;
5450
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 /* Shortcut for single character strings */
5452 if (PyUnicode_GET_SIZE(self) == 1 &&
5453 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005454 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005456 /* Special case for empty strings */
5457 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005458 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005459
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 e = p + PyUnicode_GET_SIZE(self);
5461 for (; p < e; p++) {
5462 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005463 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005465 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466}
5467
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005468PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469"S.join(sequence) -> unicode\n\
5470\n\
5471Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005472sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473
5474static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005475unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005477 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478}
5479
5480static int
5481unicode_length(PyUnicodeObject *self)
5482{
5483 return self->length;
5484}
5485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005486PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005487"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488\n\
5489Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005490done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491
5492static PyObject *
5493unicode_ljust(PyUnicodeObject *self, PyObject *args)
5494{
5495 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005496 Py_UNICODE fillchar = ' ';
5497
5498 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 return NULL;
5500
Tim Peters7a29bd52001-09-12 03:03:31 +00005501 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 Py_INCREF(self);
5503 return (PyObject*) self;
5504 }
5505
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005506 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507}
5508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005509PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510"S.lower() -> unicode\n\
5511\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005512Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513
5514static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005515unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 return fixup(self, fixlower);
5518}
5519
/* Strip modes understood by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format for each strip mode, indexed by the
   constants above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its leading
   "|O:" (three characters) skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
5528
5529static const Py_UNICODE *
5530unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5531{
Tim Peters030a5ce2002-04-22 19:00:10 +00005532 size_t i;
5533 for (i = 0; i < n; ++i)
5534 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005535 return s+i;
5536 return NULL;
5537}
5538
5539/* externally visible for str.strip(unicode) */
5540PyObject *
5541_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5542{
5543 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5544 int len = PyUnicode_GET_SIZE(self);
5545 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5546 int seplen = PyUnicode_GET_SIZE(sepobj);
5547 int i, j;
5548
5549 i = 0;
5550 if (striptype != RIGHTSTRIP) {
5551 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5552 i++;
5553 }
5554 }
5555
5556 j = len;
5557 if (striptype != LEFTSTRIP) {
5558 do {
5559 j--;
5560 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5561 j++;
5562 }
5563
5564 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5565 Py_INCREF(self);
5566 return (PyObject*)self;
5567 }
5568 else
5569 return PyUnicode_FromUnicode(s+i, j-i);
5570}
5571
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572
5573static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005574do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005576 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5577 int len = PyUnicode_GET_SIZE(self), i, j;
5578
5579 i = 0;
5580 if (striptype != RIGHTSTRIP) {
5581 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5582 i++;
5583 }
5584 }
5585
5586 j = len;
5587 if (striptype != LEFTSTRIP) {
5588 do {
5589 j--;
5590 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5591 j++;
5592 }
5593
5594 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5595 Py_INCREF(self);
5596 return (PyObject*)self;
5597 }
5598 else
5599 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600}
5601
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005602
5603static PyObject *
5604do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5605{
5606 PyObject *sep = NULL;
5607
5608 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5609 return NULL;
5610
5611 if (sep != NULL && sep != Py_None) {
5612 if (PyUnicode_Check(sep))
5613 return _PyUnicode_XStrip(self, striptype, sep);
5614 else if (PyString_Check(sep)) {
5615 PyObject *res;
5616 sep = PyUnicode_FromObject(sep);
5617 if (sep==NULL)
5618 return NULL;
5619 res = _PyUnicode_XStrip(self, striptype, sep);
5620 Py_DECREF(sep);
5621 return res;
5622 }
5623 else {
5624 PyErr_Format(PyExc_TypeError,
5625 "%s arg must be None, unicode or str",
5626 STRIPNAME(striptype));
5627 return NULL;
5628 }
5629 }
5630
5631 return do_strip(self, striptype);
5632}
5633
5634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005635PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005636"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005637\n\
5638Return a copy of the string S with leading and trailing\n\
5639whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005640If chars is given and not None, remove characters in chars instead.\n\
5641If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005642
5643static PyObject *
5644unicode_strip(PyUnicodeObject *self, PyObject *args)
5645{
5646 if (PyTuple_GET_SIZE(args) == 0)
5647 return do_strip(self, BOTHSTRIP); /* Common case */
5648 else
5649 return do_argstrip(self, BOTHSTRIP, args);
5650}
5651
5652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005653PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005654"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005655\n\
5656Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005657If chars is given and not None, remove characters in chars instead.\n\
5658If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005659
5660static PyObject *
5661unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5662{
5663 if (PyTuple_GET_SIZE(args) == 0)
5664 return do_strip(self, LEFTSTRIP); /* Common case */
5665 else
5666 return do_argstrip(self, LEFTSTRIP, args);
5667}
5668
5669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005670PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005671"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005672\n\
5673Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005674If chars is given and not None, remove characters in chars instead.\n\
5675If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005676
5677static PyObject *
5678unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5679{
5680 if (PyTuple_GET_SIZE(args) == 0)
5681 return do_strip(self, RIGHTSTRIP); /* Common case */
5682 else
5683 return do_argstrip(self, RIGHTSTRIP, args);
5684}
5685
5686
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687static PyObject*
5688unicode_repeat(PyUnicodeObject *str, int len)
5689{
5690 PyUnicodeObject *u;
5691 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005692 int nchars;
5693 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694
5695 if (len < 0)
5696 len = 0;
5697
Tim Peters7a29bd52001-09-12 03:03:31 +00005698 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 /* no repeat, return original string */
5700 Py_INCREF(str);
5701 return (PyObject*) str;
5702 }
Tim Peters8f422462000-09-09 06:13:41 +00005703
5704 /* ensure # of chars needed doesn't overflow int and # of bytes
5705 * needed doesn't overflow size_t
5706 */
5707 nchars = len * str->length;
5708 if (len && nchars / len != str->length) {
5709 PyErr_SetString(PyExc_OverflowError,
5710 "repeated string is too long");
5711 return NULL;
5712 }
5713 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5714 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5715 PyErr_SetString(PyExc_OverflowError,
5716 "repeated string is too long");
5717 return NULL;
5718 }
5719 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 if (!u)
5721 return NULL;
5722
5723 p = u->str;
5724
5725 while (len-- > 0) {
5726 Py_UNICODE_COPY(p, str->str, str->length);
5727 p += str->length;
5728 }
5729
5730 return (PyObject*) u;
5731}
5732
5733PyObject *PyUnicode_Replace(PyObject *obj,
5734 PyObject *subobj,
5735 PyObject *replobj,
5736 int maxcount)
5737{
5738 PyObject *self;
5739 PyObject *str1;
5740 PyObject *str2;
5741 PyObject *result;
5742
5743 self = PyUnicode_FromObject(obj);
5744 if (self == NULL)
5745 return NULL;
5746 str1 = PyUnicode_FromObject(subobj);
5747 if (str1 == NULL) {
5748 Py_DECREF(self);
5749 return NULL;
5750 }
5751 str2 = PyUnicode_FromObject(replobj);
5752 if (str2 == NULL) {
5753 Py_DECREF(self);
5754 Py_DECREF(str1);
5755 return NULL;
5756 }
Tim Petersced69f82003-09-16 20:30:58 +00005757 result = replace((PyUnicodeObject *)self,
5758 (PyUnicodeObject *)str1,
5759 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 maxcount);
5761 Py_DECREF(self);
5762 Py_DECREF(str1);
5763 Py_DECREF(str2);
5764 return result;
5765}
5766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005767PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768"S.replace (old, new[, maxsplit]) -> unicode\n\
5769\n\
5770Return a copy of S with all occurrences of substring\n\
5771old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005772given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773
5774static PyObject*
5775unicode_replace(PyUnicodeObject *self, PyObject *args)
5776{
5777 PyUnicodeObject *str1;
5778 PyUnicodeObject *str2;
5779 int maxcount = -1;
5780 PyObject *result;
5781
5782 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5783 return NULL;
5784 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5785 if (str1 == NULL)
5786 return NULL;
5787 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005788 if (str2 == NULL) {
5789 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
5793 result = replace(self, str1, str2, maxcount);
5794
5795 Py_DECREF(str1);
5796 Py_DECREF(str2);
5797 return result;
5798}
5799
5800static
5801PyObject *unicode_repr(PyObject *unicode)
5802{
5803 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5804 PyUnicode_GET_SIZE(unicode),
5805 1);
5806}
5807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005808PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809"S.rfind(sub [,start [,end]]) -> int\n\
5810\n\
5811Return the highest index in S where substring sub is found,\n\
5812such that sub is contained within s[start,end]. Optional\n\
5813arguments start and end are interpreted as in slice notation.\n\
5814\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005815Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816
5817static PyObject *
5818unicode_rfind(PyUnicodeObject *self, PyObject *args)
5819{
5820 PyUnicodeObject *substring;
5821 int start = 0;
5822 int end = INT_MAX;
5823 PyObject *result;
5824
Guido van Rossumb8872e62000-05-09 14:14:27 +00005825 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5826 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 return NULL;
5828 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5829 (PyObject *)substring);
5830 if (substring == NULL)
5831 return NULL;
5832
5833 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5834
5835 Py_DECREF(substring);
5836 return result;
5837}
5838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005839PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840"S.rindex(sub [,start [,end]]) -> int\n\
5841\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005842Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843
5844static PyObject *
5845unicode_rindex(PyUnicodeObject *self, PyObject *args)
5846{
5847 int result;
5848 PyUnicodeObject *substring;
5849 int start = 0;
5850 int end = INT_MAX;
5851
Guido van Rossumb8872e62000-05-09 14:14:27 +00005852 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5853 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 return NULL;
5855 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5856 (PyObject *)substring);
5857 if (substring == NULL)
5858 return NULL;
5859
5860 result = findstring(self, substring, start, end, -1);
5861
5862 Py_DECREF(substring);
5863 if (result < 0) {
5864 PyErr_SetString(PyExc_ValueError, "substring not found");
5865 return NULL;
5866 }
5867 return PyInt_FromLong(result);
5868}
5869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005870PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005871"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872\n\
5873Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005874done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875
5876static PyObject *
5877unicode_rjust(PyUnicodeObject *self, PyObject *args)
5878{
5879 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005880 Py_UNICODE fillchar = ' ';
5881
5882 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 return NULL;
5884
Tim Peters7a29bd52001-09-12 03:03:31 +00005885 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 Py_INCREF(self);
5887 return (PyObject*) self;
5888 }
5889
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005890 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891}
5892
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893static PyObject*
5894unicode_slice(PyUnicodeObject *self, int start, int end)
5895{
5896 /* standard clamping */
5897 if (start < 0)
5898 start = 0;
5899 if (end < 0)
5900 end = 0;
5901 if (end > self->length)
5902 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005903 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 /* full slice, return original string */
5905 Py_INCREF(self);
5906 return (PyObject*) self;
5907 }
5908 if (start > end)
5909 start = end;
5910 /* copy slice */
5911 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5912 end - start);
5913}
5914
5915PyObject *PyUnicode_Split(PyObject *s,
5916 PyObject *sep,
5917 int maxsplit)
5918{
5919 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005920
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 s = PyUnicode_FromObject(s);
5922 if (s == NULL)
5923 return NULL;
5924 if (sep != NULL) {
5925 sep = PyUnicode_FromObject(sep);
5926 if (sep == NULL) {
5927 Py_DECREF(s);
5928 return NULL;
5929 }
5930 }
5931
5932 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5933
5934 Py_DECREF(s);
5935 Py_XDECREF(sep);
5936 return result;
5937}
5938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005939PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940"S.split([sep [,maxsplit]]) -> list of strings\n\
5941\n\
5942Return a list of the words in S, using sep as the\n\
5943delimiter string. If maxsplit is given, at most maxsplit\n\
5944splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005945is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946
5947static PyObject*
5948unicode_split(PyUnicodeObject *self, PyObject *args)
5949{
5950 PyObject *substring = Py_None;
5951 int maxcount = -1;
5952
5953 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5954 return NULL;
5955
5956 if (substring == Py_None)
5957 return split(self, NULL, maxcount);
5958 else if (PyUnicode_Check(substring))
5959 return split(self, (PyUnicodeObject *)substring, maxcount);
5960 else
5961 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5962}
5963
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005964PyObject *PyUnicode_RSplit(PyObject *s,
5965 PyObject *sep,
5966 int maxsplit)
5967{
5968 PyObject *result;
5969
5970 s = PyUnicode_FromObject(s);
5971 if (s == NULL)
5972 return NULL;
5973 if (sep != NULL) {
5974 sep = PyUnicode_FromObject(sep);
5975 if (sep == NULL) {
5976 Py_DECREF(s);
5977 return NULL;
5978 }
5979 }
5980
5981 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5982
5983 Py_DECREF(s);
5984 Py_XDECREF(sep);
5985 return result;
5986}
5987
5988PyDoc_STRVAR(rsplit__doc__,
5989"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
5990\n\
5991Return a list of the words in S, using sep as the\n\
5992delimiter string, starting at the end of the string and\n\
5993working to the front. If maxsplit is given, at most maxsplit\n\
5994splits are done. If sep is not specified, any whitespace string\n\
5995is a separator.");
5996
5997static PyObject*
5998unicode_rsplit(PyUnicodeObject *self, PyObject *args)
5999{
6000 PyObject *substring = Py_None;
6001 int maxcount = -1;
6002
6003 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6004 return NULL;
6005
6006 if (substring == Py_None)
6007 return rsplit(self, NULL, maxcount);
6008 else if (PyUnicode_Check(substring))
6009 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6010 else
6011 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6012}
6013
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006014PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006015"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016\n\
6017Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006018Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006019is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
6021static PyObject*
6022unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6023{
Guido van Rossum86662912000-04-11 15:38:46 +00006024 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
Guido van Rossum86662912000-04-11 15:38:46 +00006026 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 return NULL;
6028
Guido van Rossum86662912000-04-11 15:38:46 +00006029 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030}
6031
6032static
6033PyObject *unicode_str(PyUnicodeObject *self)
6034{
Fred Drakee4315f52000-05-09 19:53:39 +00006035 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036}
6037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006038PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039"S.swapcase() -> unicode\n\
6040\n\
6041Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006042and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043
6044static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006045unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 return fixup(self, fixswapcase);
6048}
6049
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006050PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051"S.translate(table) -> unicode\n\
6052\n\
6053Return a copy of the string S, where all characters have been mapped\n\
6054through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006055Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6056Unmapped characters are left untouched. Characters mapped to None\n\
6057are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058
6059static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006060unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061{
Tim Petersced69f82003-09-16 20:30:58 +00006062 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006064 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 "ignore");
6066}
6067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006068PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069"S.upper() -> unicode\n\
6070\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006071Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072
6073static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006074unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 return fixup(self, fixupper);
6077}
6078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006079PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080"S.zfill(width) -> unicode\n\
6081\n\
6082Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006083of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084
6085static PyObject *
6086unicode_zfill(PyUnicodeObject *self, PyObject *args)
6087{
6088 int fill;
6089 PyUnicodeObject *u;
6090
6091 int width;
6092 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6093 return NULL;
6094
6095 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006096 if (PyUnicode_CheckExact(self)) {
6097 Py_INCREF(self);
6098 return (PyObject*) self;
6099 }
6100 else
6101 return PyUnicode_FromUnicode(
6102 PyUnicode_AS_UNICODE(self),
6103 PyUnicode_GET_SIZE(self)
6104 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 }
6106
6107 fill = width - self->length;
6108
6109 u = pad(self, fill, 0, '0');
6110
Walter Dörwald068325e2002-04-15 13:36:47 +00006111 if (u == NULL)
6112 return NULL;
6113
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 if (u->str[fill] == '+' || u->str[fill] == '-') {
6115 /* move sign to beginning of string */
6116 u->str[0] = u->str[fill];
6117 u->str[fill] = '0';
6118 }
6119
6120 return (PyObject*) u;
6121}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122
#if 0
/* Disabled debugging aid: report unicode_freelist_size as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long nfree = unicode_freelist_size;

    return PyInt_FromLong(nfree);
}
#endif
6130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006131PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006132"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006134Return True if S starts with the specified prefix, False otherwise.\n\
6135With optional start, test S beginning at that position.\n\
6136With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137
6138static PyObject *
6139unicode_startswith(PyUnicodeObject *self,
6140 PyObject *args)
6141{
6142 PyUnicodeObject *substring;
6143 int start = 0;
6144 int end = INT_MAX;
6145 PyObject *result;
6146
Guido van Rossumb8872e62000-05-09 14:14:27 +00006147 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6148 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 return NULL;
6150 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6151 (PyObject *)substring);
6152 if (substring == NULL)
6153 return NULL;
6154
Guido van Rossum77f6a652002-04-03 22:41:51 +00006155 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156
6157 Py_DECREF(substring);
6158 return result;
6159}
6160
6161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006162PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006163"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006165Return True if S ends with the specified suffix, False otherwise.\n\
6166With optional start, test S beginning at that position.\n\
6167With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168
6169static PyObject *
6170unicode_endswith(PyUnicodeObject *self,
6171 PyObject *args)
6172{
6173 PyUnicodeObject *substring;
6174 int start = 0;
6175 int end = INT_MAX;
6176 PyObject *result;
6177
Guido van Rossumb8872e62000-05-09 14:14:27 +00006178 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6179 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 return NULL;
6181 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6182 (PyObject *)substring);
6183 if (substring == NULL)
6184 return NULL;
6185
Guido van Rossum77f6a652002-04-03 22:41:51 +00006186 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
6188 Py_DECREF(substring);
6189 return result;
6190}
6191
6192
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006193
6194static PyObject *
6195unicode_getnewargs(PyUnicodeObject *v)
6196{
6197 return Py_BuildValue("(u#)", v->str, v->length);
6198}
6199
6200
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201static PyMethodDef unicode_methods[] = {
6202
6203 /* Order is according to common usage: often used methods should
6204 appear first, since lookup is done sequentially. */
6205
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006206 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6207 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6208 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006209 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006210 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6211 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6212 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6213 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6214 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6215 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6216 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6217 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6218 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6219 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006220 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006221 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006222/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6223 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6224 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6225 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006226 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006227 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006228 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006229 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6230 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6231 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6232 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6233 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6234 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6235 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6236 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6237 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6238 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6239 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6240 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6241 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6242 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006243 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006244#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006245 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246#endif
6247
6248#if 0
6249 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006250 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251#endif
6252
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006253 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 {NULL, NULL}
6255};
6256
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006257static PyObject *
6258unicode_mod(PyObject *v, PyObject *w)
6259{
6260 if (!PyUnicode_Check(v)) {
6261 Py_INCREF(Py_NotImplemented);
6262 return Py_NotImplemented;
6263 }
6264 return PyUnicode_Format(v, w);
6265}
6266
6267static PyNumberMethods unicode_as_number = {
6268 0, /*nb_add*/
6269 0, /*nb_subtract*/
6270 0, /*nb_multiply*/
6271 0, /*nb_divide*/
6272 unicode_mod, /*nb_remainder*/
6273};
6274
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275static PySequenceMethods unicode_as_sequence = {
6276 (inquiry) unicode_length, /* sq_length */
6277 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6278 (intargfunc) unicode_repeat, /* sq_repeat */
6279 (intargfunc) unicode_getitem, /* sq_item */
6280 (intintargfunc) unicode_slice, /* sq_slice */
6281 0, /* sq_ass_item */
6282 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006283 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284};
6285
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006286static PyObject*
6287unicode_subscript(PyUnicodeObject* self, PyObject* item)
6288{
6289 if (PyInt_Check(item)) {
6290 long i = PyInt_AS_LONG(item);
6291 if (i < 0)
6292 i += PyString_GET_SIZE(self);
6293 return unicode_getitem(self, i);
6294 } else if (PyLong_Check(item)) {
6295 long i = PyLong_AsLong(item);
6296 if (i == -1 && PyErr_Occurred())
6297 return NULL;
6298 if (i < 0)
6299 i += PyString_GET_SIZE(self);
6300 return unicode_getitem(self, i);
6301 } else if (PySlice_Check(item)) {
6302 int start, stop, step, slicelength, cur, i;
6303 Py_UNICODE* source_buf;
6304 Py_UNICODE* result_buf;
6305 PyObject* result;
6306
6307 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6308 &start, &stop, &step, &slicelength) < 0) {
6309 return NULL;
6310 }
6311
6312 if (slicelength <= 0) {
6313 return PyUnicode_FromUnicode(NULL, 0);
6314 } else {
6315 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6316 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6317
6318 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6319 result_buf[i] = source_buf[cur];
6320 }
Tim Petersced69f82003-09-16 20:30:58 +00006321
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006322 result = PyUnicode_FromUnicode(result_buf, slicelength);
6323 PyMem_FREE(result_buf);
6324 return result;
6325 }
6326 } else {
6327 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6328 return NULL;
6329 }
6330}
6331
6332static PyMappingMethods unicode_as_mapping = {
6333 (inquiry)unicode_length, /* mp_length */
6334 (binaryfunc)unicode_subscript, /* mp_subscript */
6335 (objobjargproc)0, /* mp_ass_subscript */
6336};
6337
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338static int
6339unicode_buffer_getreadbuf(PyUnicodeObject *self,
6340 int index,
6341 const void **ptr)
6342{
6343 if (index != 0) {
6344 PyErr_SetString(PyExc_SystemError,
6345 "accessing non-existent unicode segment");
6346 return -1;
6347 }
6348 *ptr = (void *) self->str;
6349 return PyUnicode_GET_DATA_SIZE(self);
6350}
6351
6352static int
6353unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6354 const void **ptr)
6355{
6356 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006357 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 return -1;
6359}
6360
6361static int
6362unicode_buffer_getsegcount(PyUnicodeObject *self,
6363 int *lenp)
6364{
6365 if (lenp)
6366 *lenp = PyUnicode_GET_DATA_SIZE(self);
6367 return 1;
6368}
6369
6370static int
6371unicode_buffer_getcharbuf(PyUnicodeObject *self,
6372 int index,
6373 const void **ptr)
6374{
6375 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006376
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 if (index != 0) {
6378 PyErr_SetString(PyExc_SystemError,
6379 "accessing non-existent unicode segment");
6380 return -1;
6381 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006382 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 if (str == NULL)
6384 return -1;
6385 *ptr = (void *) PyString_AS_STRING(str);
6386 return PyString_GET_SIZE(str);
6387}
6388
6389/* Helpers for PyUnicode_Format() */
6390
6391static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006392getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393{
6394 int argidx = *p_argidx;
6395 if (argidx < arglen) {
6396 (*p_argidx)++;
6397 if (arglen < 0)
6398 return args;
6399 else
6400 return PyTuple_GetItem(args, argidx);
6401 }
6402 PyErr_SetString(PyExc_TypeError,
6403 "not enough arguments for format string");
6404 return NULL;
6405}
6406
/* Formatting flag bits; each occupies its own bit so they can be
   OR-ed together.  F_ALT selects the '#' alternate form. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6412
6413static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415{
6416 register int i;
6417 int len;
6418 va_list va;
6419 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421
6422 /* First, format the string as char array, then expand to Py_UNICODE
6423 array. */
6424 charbuffer = (char *)buffer;
6425 len = vsprintf(charbuffer, format, va);
6426 for (i = len - 1; i >= 0; i--)
6427 buffer[i] = (Py_UNICODE) charbuffer[i];
6428
6429 va_end(va);
6430 return len;
6431}
6432
Guido van Rossum078151d2002-08-11 04:24:12 +00006433/* XXX To save some code duplication, formatfloat/long/int could have been
6434 shared with stringobject.c, converting from 8-bit to Unicode after the
6435 formatting is done. */
6436
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437static int
6438formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006439 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 int flags,
6441 int prec,
6442 int type,
6443 PyObject *v)
6444{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006445 /* fmt = '%#.' + `prec` + `type`
6446 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 char fmt[20];
6448 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006449
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 x = PyFloat_AsDouble(v);
6451 if (x == -1.0 && PyErr_Occurred())
6452 return -1;
6453 if (prec < 0)
6454 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6456 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006457 /* Worst case length calc to ensure no buffer overrun:
6458
6459 'g' formats:
6460 fmt = %#.<prec>g
6461 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6462 for any double rep.)
6463 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6464
6465 'f' formats:
6466 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6467 len = 1 + 50 + 1 + prec = 52 + prec
6468
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006469 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006470 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006471
6472 */
6473 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6474 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006475 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006476 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006477 return -1;
6478 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006479 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6480 (flags&F_ALT) ? "#" : "",
6481 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 return usprintf(buf, fmt, x);
6483}
6484
Tim Peters38fd5b62000-09-21 05:43:11 +00006485static PyObject*
6486formatlong(PyObject *val, int flags, int prec, int type)
6487{
6488 char *buf;
6489 int i, len;
6490 PyObject *str; /* temporary string object. */
6491 PyUnicodeObject *result;
6492
6493 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6494 if (!str)
6495 return NULL;
6496 result = _PyUnicode_New(len);
6497 for (i = 0; i < len; i++)
6498 result->str[i] = buf[i];
6499 result->str[len] = 0;
6500 Py_DECREF(str);
6501 return (PyObject*)result;
6502}
6503
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504static int
6505formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006506 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 int flags,
6508 int prec,
6509 int type,
6510 PyObject *v)
6511{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006512 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006513 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6514 * + 1 + 1
6515 * = 24
6516 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006517 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006518 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 long x;
6520
6521 x = PyInt_AsLong(v);
6522 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006523 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006524 if (x < 0 && type == 'u') {
6525 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006526 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006527 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6528 sign = "-";
6529 else
6530 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006532 prec = 1;
6533
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006534 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6535 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006536 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006537 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006538 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006539 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006540 return -1;
6541 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006542
6543 if ((flags & F_ALT) &&
6544 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006545 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006546 * of issues that cause pain:
6547 * - when 0 is being converted, the C standard leaves off
6548 * the '0x' or '0X', which is inconsistent with other
6549 * %#x/%#X conversions and inconsistent with Python's
6550 * hex() function
6551 * - there are platforms that violate the standard and
6552 * convert 0 with the '0x' or '0X'
6553 * (Metrowerks, Compaq Tru64)
6554 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006555 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006556 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006557 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006558 * We can achieve the desired consistency by inserting our
6559 * own '0x' or '0X' prefix, and substituting %x/%X in place
6560 * of %#x/%#X.
6561 *
6562 * Note that this is the same approach as used in
6563 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006564 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006565 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6566 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006567 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006568 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006569 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6570 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006571 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006572 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006573 if (sign[0])
6574 return usprintf(buf, fmt, -x);
6575 else
6576 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577}
6578
6579static int
6580formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006581 size_t buflen,
6582 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006584 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006585 if (PyUnicode_Check(v)) {
6586 if (PyUnicode_GET_SIZE(v) != 1)
6587 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006591 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006592 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006593 goto onError;
6594 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
6597 else {
6598 /* Integer input truncated to a character */
6599 long x;
6600 x = PyInt_AsLong(v);
6601 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006602 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006603#ifdef Py_UNICODE_WIDE
6604 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006605 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006606 "%c arg not in range(0x110000) "
6607 "(wide Python build)");
6608 return -1;
6609 }
6610#else
6611 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006612 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006613 "%c arg not in range(0x10000) "
6614 "(narrow Python build)");
6615 return -1;
6616 }
6617#endif
6618 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 }
6620 buf[1] = '\0';
6621 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006622
6623 onError:
6624 PyErr_SetString(PyExc_TypeError,
6625 "%c requires int or char");
6626 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627}
6628
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single primary expression wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
6638
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639PyObject *PyUnicode_Format(PyObject *format,
6640 PyObject *args)
6641{
6642 Py_UNICODE *fmt, *res;
6643 int fmtcnt, rescnt, reslen, arglen, argidx;
6644 int args_owned = 0;
6645 PyUnicodeObject *result = NULL;
6646 PyObject *dict = NULL;
6647 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006648
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 if (format == NULL || args == NULL) {
6650 PyErr_BadInternalCall();
6651 return NULL;
6652 }
6653 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006654 if (uformat == NULL)
6655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 fmt = PyUnicode_AS_UNICODE(uformat);
6657 fmtcnt = PyUnicode_GET_SIZE(uformat);
6658
6659 reslen = rescnt = fmtcnt + 100;
6660 result = _PyUnicode_New(reslen);
6661 if (result == NULL)
6662 goto onError;
6663 res = PyUnicode_AS_UNICODE(result);
6664
6665 if (PyTuple_Check(args)) {
6666 arglen = PyTuple_Size(args);
6667 argidx = 0;
6668 }
6669 else {
6670 arglen = -1;
6671 argidx = -2;
6672 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006673 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6674 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 dict = args;
6676
6677 while (--fmtcnt >= 0) {
6678 if (*fmt != '%') {
6679 if (--rescnt < 0) {
6680 rescnt = fmtcnt + 100;
6681 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006682 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 return NULL;
6684 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6685 --rescnt;
6686 }
6687 *res++ = *fmt++;
6688 }
6689 else {
6690 /* Got a format specifier */
6691 int flags = 0;
6692 int width = -1;
6693 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 Py_UNICODE c = '\0';
6695 Py_UNICODE fill;
6696 PyObject *v = NULL;
6697 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006698 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 Py_UNICODE sign;
6700 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006701 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702
6703 fmt++;
6704 if (*fmt == '(') {
6705 Py_UNICODE *keystart;
6706 int keylen;
6707 PyObject *key;
6708 int pcount = 1;
6709
6710 if (dict == NULL) {
6711 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006712 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 goto onError;
6714 }
6715 ++fmt;
6716 --fmtcnt;
6717 keystart = fmt;
6718 /* Skip over balanced parentheses */
6719 while (pcount > 0 && --fmtcnt >= 0) {
6720 if (*fmt == ')')
6721 --pcount;
6722 else if (*fmt == '(')
6723 ++pcount;
6724 fmt++;
6725 }
6726 keylen = fmt - keystart - 1;
6727 if (fmtcnt < 0 || pcount > 0) {
6728 PyErr_SetString(PyExc_ValueError,
6729 "incomplete format key");
6730 goto onError;
6731 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006732#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006733 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 then looked up since Python uses strings to hold
6735 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006736 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 key = PyUnicode_EncodeUTF8(keystart,
6738 keylen,
6739 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006740#else
6741 key = PyUnicode_FromUnicode(keystart, keylen);
6742#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 if (key == NULL)
6744 goto onError;
6745 if (args_owned) {
6746 Py_DECREF(args);
6747 args_owned = 0;
6748 }
6749 args = PyObject_GetItem(dict, key);
6750 Py_DECREF(key);
6751 if (args == NULL) {
6752 goto onError;
6753 }
6754 args_owned = 1;
6755 arglen = -1;
6756 argidx = -2;
6757 }
6758 while (--fmtcnt >= 0) {
6759 switch (c = *fmt++) {
6760 case '-': flags |= F_LJUST; continue;
6761 case '+': flags |= F_SIGN; continue;
6762 case ' ': flags |= F_BLANK; continue;
6763 case '#': flags |= F_ALT; continue;
6764 case '0': flags |= F_ZERO; continue;
6765 }
6766 break;
6767 }
6768 if (c == '*') {
6769 v = getnextarg(args, arglen, &argidx);
6770 if (v == NULL)
6771 goto onError;
6772 if (!PyInt_Check(v)) {
6773 PyErr_SetString(PyExc_TypeError,
6774 "* wants int");
6775 goto onError;
6776 }
6777 width = PyInt_AsLong(v);
6778 if (width < 0) {
6779 flags |= F_LJUST;
6780 width = -width;
6781 }
6782 if (--fmtcnt >= 0)
6783 c = *fmt++;
6784 }
6785 else if (c >= '0' && c <= '9') {
6786 width = c - '0';
6787 while (--fmtcnt >= 0) {
6788 c = *fmt++;
6789 if (c < '0' || c > '9')
6790 break;
6791 if ((width*10) / 10 != width) {
6792 PyErr_SetString(PyExc_ValueError,
6793 "width too big");
6794 goto onError;
6795 }
6796 width = width*10 + (c - '0');
6797 }
6798 }
6799 if (c == '.') {
6800 prec = 0;
6801 if (--fmtcnt >= 0)
6802 c = *fmt++;
6803 if (c == '*') {
6804 v = getnextarg(args, arglen, &argidx);
6805 if (v == NULL)
6806 goto onError;
6807 if (!PyInt_Check(v)) {
6808 PyErr_SetString(PyExc_TypeError,
6809 "* wants int");
6810 goto onError;
6811 }
6812 prec = PyInt_AsLong(v);
6813 if (prec < 0)
6814 prec = 0;
6815 if (--fmtcnt >= 0)
6816 c = *fmt++;
6817 }
6818 else if (c >= '0' && c <= '9') {
6819 prec = c - '0';
6820 while (--fmtcnt >= 0) {
6821 c = Py_CHARMASK(*fmt++);
6822 if (c < '0' || c > '9')
6823 break;
6824 if ((prec*10) / 10 != prec) {
6825 PyErr_SetString(PyExc_ValueError,
6826 "prec too big");
6827 goto onError;
6828 }
6829 prec = prec*10 + (c - '0');
6830 }
6831 }
6832 } /* prec */
6833 if (fmtcnt >= 0) {
6834 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 if (--fmtcnt >= 0)
6836 c = *fmt++;
6837 }
6838 }
6839 if (fmtcnt < 0) {
6840 PyErr_SetString(PyExc_ValueError,
6841 "incomplete format");
6842 goto onError;
6843 }
6844 if (c != '%') {
6845 v = getnextarg(args, arglen, &argidx);
6846 if (v == NULL)
6847 goto onError;
6848 }
6849 sign = 0;
6850 fill = ' ';
6851 switch (c) {
6852
6853 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006854 pbuf = formatbuf;
6855 /* presume that buffer length is at least 1 */
6856 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 len = 1;
6858 break;
6859
6860 case 's':
6861 case 'r':
6862 if (PyUnicode_Check(v) && c == 's') {
6863 temp = v;
6864 Py_INCREF(temp);
6865 }
6866 else {
6867 PyObject *unicode;
6868 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006869 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 else
6871 temp = PyObject_Repr(v);
6872 if (temp == NULL)
6873 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006874 if (PyUnicode_Check(temp))
6875 /* nothing to do */;
6876 else if (PyString_Check(temp)) {
6877 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00006878 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006880 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 "strict");
6882 Py_DECREF(temp);
6883 temp = unicode;
6884 if (temp == NULL)
6885 goto onError;
6886 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006887 else {
6888 Py_DECREF(temp);
6889 PyErr_SetString(PyExc_TypeError,
6890 "%s argument has non-string str()");
6891 goto onError;
6892 }
6893 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006894 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 len = PyUnicode_GET_SIZE(temp);
6896 if (prec >= 0 && len > prec)
6897 len = prec;
6898 break;
6899
6900 case 'i':
6901 case 'd':
6902 case 'u':
6903 case 'o':
6904 case 'x':
6905 case 'X':
6906 if (c == 'i')
6907 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006908 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006909 temp = formatlong(v, flags, prec, c);
6910 if (!temp)
6911 goto onError;
6912 pbuf = PyUnicode_AS_UNICODE(temp);
6913 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00006914 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006916 else {
6917 pbuf = formatbuf;
6918 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6919 flags, prec, c, v);
6920 if (len < 0)
6921 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006922 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006923 }
6924 if (flags & F_ZERO)
6925 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 break;
6927
6928 case 'e':
6929 case 'E':
6930 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006931 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 case 'g':
6933 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006934 if (c == 'F')
6935 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006936 pbuf = formatbuf;
6937 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6938 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939 if (len < 0)
6940 goto onError;
6941 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006942 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943 fill = '0';
6944 break;
6945
6946 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006947 pbuf = formatbuf;
6948 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 if (len < 0)
6950 goto onError;
6951 break;
6952
6953 default:
6954 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006955 "unsupported format character '%c' (0x%x) "
6956 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006957 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006958 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006959 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 goto onError;
6961 }
6962 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006963 if (*pbuf == '-' || *pbuf == '+') {
6964 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 len--;
6966 }
6967 else if (flags & F_SIGN)
6968 sign = '+';
6969 else if (flags & F_BLANK)
6970 sign = ' ';
6971 else
6972 sign = 0;
6973 }
6974 if (width < len)
6975 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006976 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 reslen -= rescnt;
6978 rescnt = width + fmtcnt + 100;
6979 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006980 if (reslen < 0) {
6981 Py_DECREF(result);
6982 return PyErr_NoMemory();
6983 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006984 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 return NULL;
6986 res = PyUnicode_AS_UNICODE(result)
6987 + reslen - rescnt;
6988 }
6989 if (sign) {
6990 if (fill != ' ')
6991 *res++ = sign;
6992 rescnt--;
6993 if (width > len)
6994 width--;
6995 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006996 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6997 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006998 assert(pbuf[1] == c);
6999 if (fill != ' ') {
7000 *res++ = *pbuf++;
7001 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007002 }
Tim Petersfff53252001-04-12 18:38:48 +00007003 rescnt -= 2;
7004 width -= 2;
7005 if (width < 0)
7006 width = 0;
7007 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 if (width > len && !(flags & F_LJUST)) {
7010 do {
7011 --rescnt;
7012 *res++ = fill;
7013 } while (--width > len);
7014 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007015 if (fill == ' ') {
7016 if (sign)
7017 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007018 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007019 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007020 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007021 *res++ = *pbuf++;
7022 *res++ = *pbuf++;
7023 }
7024 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007025 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 res += len;
7027 rescnt -= len;
7028 while (--width >= len) {
7029 --rescnt;
7030 *res++ = ' ';
7031 }
7032 if (dict && (argidx < arglen) && c != '%') {
7033 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007034 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035 goto onError;
7036 }
7037 Py_XDECREF(temp);
7038 } /* '%' */
7039 } /* until end */
7040 if (argidx < arglen && !dict) {
7041 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007042 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 goto onError;
7044 }
7045
7046 if (args_owned) {
7047 Py_DECREF(args);
7048 }
7049 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007050 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007051 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052 return (PyObject *)result;
7053
7054 onError:
7055 Py_XDECREF(result);
7056 Py_DECREF(uformat);
7057 if (args_owned) {
7058 Py_DECREF(args);
7059 }
7060 return NULL;
7061}
7062
7063static PyBufferProcs unicode_as_buffer = {
7064 (getreadbufferproc) unicode_buffer_getreadbuf,
7065 (getwritebufferproc) unicode_buffer_getwritebuf,
7066 (getsegcountproc) unicode_buffer_getsegcount,
7067 (getcharbufferproc) unicode_buffer_getcharbuf,
7068};
7069
Jeremy Hylton938ace62002-07-17 16:30:39 +00007070static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007071unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7072
Tim Peters6d6c1a32001-08-02 04:15:00 +00007073static PyObject *
7074unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7075{
7076 PyObject *x = NULL;
7077 static char *kwlist[] = {"string", "encoding", "errors", 0};
7078 char *encoding = NULL;
7079 char *errors = NULL;
7080
Guido van Rossume023fe02001-08-30 03:12:59 +00007081 if (type != &PyUnicode_Type)
7082 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007083 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7084 kwlist, &x, &encoding, &errors))
7085 return NULL;
7086 if (x == NULL)
7087 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007088 if (encoding == NULL && errors == NULL)
7089 return PyObject_Unicode(x);
7090 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007091 return PyUnicode_FromEncodedObject(x, encoding, errors);
7092}
7093
Guido van Rossume023fe02001-08-30 03:12:59 +00007094static PyObject *
7095unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7096{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007097 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007098 int n;
7099
7100 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7101 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7102 if (tmp == NULL)
7103 return NULL;
7104 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007105 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007106 if (pnew == NULL) {
7107 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007108 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007109 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007110 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7111 if (pnew->str == NULL) {
7112 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007113 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007114 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007115 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007116 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007117 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7118 pnew->length = n;
7119 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007120 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007121 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007122}
7123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007124PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007125"unicode(string [, encoding[, errors]]) -> object\n\
7126\n\
7127Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007128encoding defaults to the current default string encoding.\n\
7129errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007130
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131PyTypeObject PyUnicode_Type = {
7132 PyObject_HEAD_INIT(&PyType_Type)
7133 0, /* ob_size */
7134 "unicode", /* tp_name */
7135 sizeof(PyUnicodeObject), /* tp_size */
7136 0, /* tp_itemsize */
7137 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007138 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007140 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141 0, /* tp_setattr */
7142 (cmpfunc) unicode_compare, /* tp_compare */
7143 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007144 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007146 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 (hashfunc) unicode_hash, /* tp_hash*/
7148 0, /* tp_call*/
7149 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007150 PyObject_GenericGetAttr, /* tp_getattro */
7151 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007153 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7154 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007155 unicode_doc, /* tp_doc */
7156 0, /* tp_traverse */
7157 0, /* tp_clear */
7158 0, /* tp_richcompare */
7159 0, /* tp_weaklistoffset */
7160 0, /* tp_iter */
7161 0, /* tp_iternext */
7162 unicode_methods, /* tp_methods */
7163 0, /* tp_members */
7164 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007165 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007166 0, /* tp_dict */
7167 0, /* tp_descr_get */
7168 0, /* tp_descr_set */
7169 0, /* tp_dictoffset */
7170 0, /* tp_init */
7171 0, /* tp_alloc */
7172 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007173 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174};
7175
7176/* Initialize the Unicode implementation */
7177
Thomas Wouters78890102000-07-22 19:25:51 +00007178void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007180 int i;
7181
Fred Drakee4315f52000-05-09 19:53:39 +00007182 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007183 unicode_freelist = NULL;
7184 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007186 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007187 for (i = 0; i < 256; i++)
7188 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007189 if (PyType_Ready(&PyUnicode_Type) < 0)
7190 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191}
7192
7193/* Finalize the Unicode implementation */
7194
7195void
Thomas Wouters78890102000-07-22 19:25:51 +00007196_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007198 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007199 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007201 Py_XDECREF(unicode_empty);
7202 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007204 for (i = 0; i < 256; i++) {
7205 if (unicode_latin1[i]) {
7206 Py_DECREF(unicode_latin1[i]);
7207 unicode_latin1[i] = NULL;
7208 }
7209 }
7210
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007211 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212 PyUnicodeObject *v = u;
7213 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007214 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007215 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007216 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007217 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007219 unicode_freelist = NULL;
7220 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007222
7223/*
7224Local variables:
7225c-basic-offset: 4
7226indent-tabs-mode: nil
7227End:
7228*/