blob: f78788eb9adf5baa0de0990f34f7bf8ed458cdbd [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Size limit for the free list "keep alive" optimization.

   A Unicode object that goes onto the free list keeps its character
   buffer allocated as long as its length is below this limit; larger
   buffers are released when the object is deallocated.  This saves
   malloc() calls for small Unicode objects.

   Worst case, this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory lying around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless the build
   configuration defines WORDS_BIGENDIAN */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
301/* Internal API for use in unicodeobject.c only ! */
302#define _PyUnicode_Resize(unicodevar, length) \
303 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
371 for (i = size; i >= 0; i--)
372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
387 if (size > PyUnicode_GET_SIZE(unicode))
388 size = PyUnicode_GET_SIZE(unicode);
389#ifdef HAVE_USABLE_WCHAR_T
390 memcpy(w, unicode->str, size * sizeof(wchar_t));
391#else
392 {
393 register Py_UNICODE *u;
394 register int i;
395 u = PyUnicode_AS_UNICODE(unicode);
396 for (i = size; i >= 0; i--)
397 *w++ = *u++;
398 }
399#endif
400
401 return size;
402}
403
404#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000408 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
Hye-Shik Chang40574832004-04-06 07:24:51 +0000426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000428}
429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromObject(register PyObject *obj)
431{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000432 /* XXX Perhaps we should make this API an alias of
433 PyObject_Unicode() instead ?! */
434 if (PyUnicode_CheckExact(obj)) {
435 Py_INCREF(obj);
436 return obj;
437 }
438 if (PyUnicode_Check(obj)) {
439 /* For a Unicode subtype that's not a Unicode object,
440 return a true Unicode object with the same data. */
441 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
442 PyUnicode_GET_SIZE(obj));
443 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000444 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
445}
446
447PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
448 const char *encoding,
449 const char *errors)
450{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000451 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000453 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 if (obj == NULL) {
456 PyErr_BadInternalCall();
457 return NULL;
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460#if 0
461 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000462 that no encodings is given and then redirect to
463 PyObject_Unicode() which then applies the additional logic for
464 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466 NOTE: This API should really only be used for object which
467 represent *encoded* Unicode !
468
469 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000470 if (PyUnicode_Check(obj)) {
471 if (encoding) {
472 PyErr_SetString(PyExc_TypeError,
473 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000474 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000475 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000476 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000477 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000478#else
479 if (PyUnicode_Check(obj)) {
480 PyErr_SetString(PyExc_TypeError,
481 "decoding Unicode is not supported");
482 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#endif
485
486 /* Coerce object */
487 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000488 s = PyString_AS_STRING(obj);
489 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
492 /* Overwrite the error message with something more useful in
493 case of a TypeError. */
494 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000495 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000496 "coercing to Unicode: need string or buffer, "
497 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 obj->ob_type->tp_name);
499 goto onError;
500 }
Tim Petersced69f82003-09-16 20:30:58 +0000501
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000502 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 if (len == 0) {
504 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000509
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 return v;
511
512 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514}
515
516PyObject *PyUnicode_Decode(const char *s,
517 int size,
518 const char *encoding,
519 const char *errors)
520{
521 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000522
523 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000524 encoding = PyUnicode_GetDefaultEncoding();
525
526 /* Shortcuts for common default encodings */
527 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000529 else if (strcmp(encoding, "latin-1") == 0)
530 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
532 else if (strcmp(encoding, "mbcs") == 0)
533 return PyUnicode_DecodeMBCS(s, size, errors);
534#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "ascii") == 0)
536 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537
538 /* Decode via the codec registry */
539 buffer = PyBuffer_FromMemory((void *)s, size);
540 if (buffer == NULL)
541 goto onError;
542 unicode = PyCodec_Decode(buffer, encoding, errors);
543 if (unicode == NULL)
544 goto onError;
545 if (!PyUnicode_Check(unicode)) {
546 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000547 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548 unicode->ob_type->tp_name);
549 Py_DECREF(unicode);
550 goto onError;
551 }
552 Py_DECREF(buffer);
553 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000554
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555 onError:
556 Py_XDECREF(buffer);
557 return NULL;
558}
559
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000560PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
561 const char *encoding,
562 const char *errors)
563{
564 PyObject *v;
565
566 if (!PyUnicode_Check(unicode)) {
567 PyErr_BadArgument();
568 goto onError;
569 }
570
571 if (encoding == NULL)
572 encoding = PyUnicode_GetDefaultEncoding();
573
574 /* Decode via the codec registry */
575 v = PyCodec_Decode(unicode, encoding, errors);
576 if (v == NULL)
577 goto onError;
578 return v;
579
580 onError:
581 return NULL;
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584PyObject *PyUnicode_Encode(const Py_UNICODE *s,
585 int size,
586 const char *encoding,
587 const char *errors)
588{
589 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000590
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 unicode = PyUnicode_FromUnicode(s, size);
592 if (unicode == NULL)
593 return NULL;
594 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
595 Py_DECREF(unicode);
596 return v;
597}
598
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000599PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
600 const char *encoding,
601 const char *errors)
602{
603 PyObject *v;
604
605 if (!PyUnicode_Check(unicode)) {
606 PyErr_BadArgument();
607 goto onError;
608 }
609
610 if (encoding == NULL)
611 encoding = PyUnicode_GetDefaultEncoding();
612
613 /* Encode via the codec registry */
614 v = PyCodec_Encode(unicode, encoding, errors);
615 if (v == NULL)
616 goto onError;
617 return v;
618
619 onError:
620 return NULL;
621}
622
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
624 const char *encoding,
625 const char *errors)
626{
627 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 if (!PyUnicode_Check(unicode)) {
630 PyErr_BadArgument();
631 goto onError;
632 }
Fred Drakee4315f52000-05-09 19:53:39 +0000633
Tim Petersced69f82003-09-16 20:30:58 +0000634 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000635 encoding = PyUnicode_GetDefaultEncoding();
636
637 /* Shortcuts for common default encodings */
638 if (errors == NULL) {
639 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000640 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000641 else if (strcmp(encoding, "latin-1") == 0)
642 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000643#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
644 else if (strcmp(encoding, "mbcs") == 0)
645 return PyUnicode_AsMBCSString(unicode);
646#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000647 else if (strcmp(encoding, "ascii") == 0)
648 return PyUnicode_AsASCIIString(unicode);
649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650
651 /* Encode via the codec registry */
652 v = PyCodec_Encode(unicode, encoding, errors);
653 if (v == NULL)
654 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 if (!PyString_Check(v)) {
656 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000657 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 v->ob_type->tp_name);
659 Py_DECREF(v);
660 goto onError;
661 }
662 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 onError:
665 return NULL;
666}
667
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000668PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
669 const char *errors)
670{
671 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
672
673 if (v)
674 return v;
675 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
676 if (v && errors == NULL)
677 ((PyUnicodeObject *)unicode)->defenc = v;
678 return v;
679}
680
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
682{
683 if (!PyUnicode_Check(unicode)) {
684 PyErr_BadArgument();
685 goto onError;
686 }
687 return PyUnicode_AS_UNICODE(unicode);
688
689 onError:
690 return NULL;
691}
692
693int PyUnicode_GetSize(PyObject *unicode)
694{
695 if (!PyUnicode_Check(unicode)) {
696 PyErr_BadArgument();
697 goto onError;
698 }
699 return PyUnicode_GET_SIZE(unicode);
700
701 onError:
702 return -1;
703}
704
Thomas Wouters78890102000-07-22 19:25:51 +0000705const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000706{
707 return unicode_default_encoding;
708}
709
710int PyUnicode_SetDefaultEncoding(const char *encoding)
711{
712 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000713
Fred Drakee4315f52000-05-09 19:53:39 +0000714 /* Make sure the encoding is valid. As side effect, this also
715 loads the encoding into the codec registry cache. */
716 v = _PyCodec_Lookup(encoding);
717 if (v == NULL)
718 goto onError;
719 Py_DECREF(v);
720 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000721 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000722 sizeof(unicode_default_encoding));
723 return 0;
724
725 onError:
726 return -1;
727}
728
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000729/* error handling callback helper:
730 build arguments, call the callback and check the arguments,
731 if no exception occured, copy the replacement to the output
732 and adjust various state variables.
733 return 0 on success, -1 on error
734*/
735
736static
737int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
738 const char *encoding, const char *reason,
739 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
740 PyObject **output, int *outpos, Py_UNICODE **outptr)
741{
742 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
743
744 PyObject *restuple = NULL;
745 PyObject *repunicode = NULL;
746 int outsize = PyUnicode_GET_SIZE(*output);
747 int requiredsize;
748 int newpos;
749 Py_UNICODE *repptr;
750 int repsize;
751 int res = -1;
752
753 if (*errorHandler == NULL) {
754 *errorHandler = PyCodec_LookupError(errors);
755 if (*errorHandler == NULL)
756 goto onError;
757 }
758
759 if (*exceptionObject == NULL) {
760 *exceptionObject = PyUnicodeDecodeError_Create(
761 encoding, input, insize, *startinpos, *endinpos, reason);
762 if (*exceptionObject == NULL)
763 goto onError;
764 }
765 else {
766 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
767 goto onError;
768 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
769 goto onError;
770 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
771 goto onError;
772 }
773
774 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
775 if (restuple == NULL)
776 goto onError;
777 if (!PyTuple_Check(restuple)) {
778 PyErr_Format(PyExc_TypeError, &argparse[4]);
779 goto onError;
780 }
781 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
782 goto onError;
783 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000784 newpos = insize+newpos;
785 if (newpos<0 || newpos>insize) {
786 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
787 goto onError;
788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000789
790 /* need more space? (at least enough for what we
791 have+the replacement+the rest of the string (starting
792 at the new input position), so we won't have to check space
793 when there are no errors in the rest of the string) */
794 repptr = PyUnicode_AS_UNICODE(repunicode);
795 repsize = PyUnicode_GET_SIZE(repunicode);
796 requiredsize = *outpos + repsize + insize-newpos;
797 if (requiredsize > outsize) {
798 if (requiredsize<2*outsize)
799 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000800 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000801 goto onError;
802 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
803 }
804 *endinpos = newpos;
805 *inptr = input + newpos;
806 Py_UNICODE_COPY(*outptr, repptr, repsize);
807 *outptr += repsize;
808 *outpos += repsize;
809 /* we made it! */
810 res = 0;
811
812 onError:
813 Py_XDECREF(restuple);
814 return res;
815}
816
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000817/* --- UTF-7 Codec -------------------------------------------------------- */
818
819/* see RFC2152 for details */
820
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   Indexed by character code; each entry tells whether the character
   is special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   The table is only ever read, so it is declared const. */
static
const char utf7_special[128] = {
    /* 0x00 - 0x0f: control characters; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space and punctuation */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: digits and punctuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: '@' and 'A'..'O' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: 'P'..'Z' and punctuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: '`' and 'a'..'o' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: 'p'..'z', punctuation and DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
839
/* Helper macros for the UTF-7 codec.  Several of them evaluate their
   arguments more than once, so only pass side-effect-free expressions. */

/* True if character c has to be base64-encoded: it is above 127 or
   classified as special (1) in utf7_special, or it is whitespace (2) /
   Set O (3) and the corresponding encode flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low 6 bits of n. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  Explicit ASCII range tests
   are used instead of isalnum(): isalnum() has undefined behaviour for
   arguments that are not representable as unsigned char and may accept
   locale-specific letters above 127. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Value (0..63) of base64 digit c; only meaningful if B64CHAR(c). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 digits to out for as long as at least 6 of the `bits`
   pending low-order bits of ch remain; updates out and bits. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit 16-bit units to out for as long as at least 16 of the `bits`
   pending low-order bits of ch remain; updates out, bits and surrogate.
   Expects a variable `errmsg` and a label `utf7Error` in the expanding
   function: both are used when a surrogate code unit is met. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high \
                   surrogate so let's not bother seeing if the low \
                   surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't \
                   represent it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
874
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000875PyObject *PyUnicode_DecodeUTF7(const char *s,
876 int size,
877 const char *errors)
878{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000879 const char *starts = s;
880 int startinpos;
881 int endinpos;
882 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000883 const char *e;
884 PyUnicodeObject *unicode;
885 Py_UNICODE *p;
886 const char *errmsg = "";
887 int inShift = 0;
888 unsigned int bitsleft = 0;
889 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000890 int surrogate = 0;
891 PyObject *errorHandler = NULL;
892 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000893
894 unicode = _PyUnicode_New(size);
895 if (!unicode)
896 return NULL;
897 if (size == 0)
898 return (PyObject *)unicode;
899
900 p = unicode->str;
901 e = s + size;
902
903 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 Py_UNICODE ch;
905 restart:
906 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000907
908 if (inShift) {
909 if ((ch == '-') || !B64CHAR(ch)) {
910 inShift = 0;
911 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000912
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000913 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
914 if (bitsleft >= 6) {
915 /* The shift sequence has a partial character in it. If
916 bitsleft < 6 then we could just classify it as padding
917 but that is not the case here */
918
919 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000920 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000921 }
922 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000923 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000924 here so indicate the potential of a misencoded character. */
925
926 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
927 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
928 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000929 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000930 }
931
932 if (ch == '-') {
933 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000934 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000935 inShift = 1;
936 }
937 } else if (SPECIAL(ch,0,0)) {
938 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000939 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000940 } else {
941 *p++ = ch;
942 }
943 } else {
944 charsleft = (charsleft << 6) | UB64(ch);
945 bitsleft += 6;
946 s++;
947 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
948 }
949 }
950 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000952 s++;
953 if (s < e && *s == '-') {
954 s++;
955 *p++ = '+';
956 } else
957 {
958 inShift = 1;
959 bitsleft = 0;
960 }
961 }
962 else if (SPECIAL(ch,0,0)) {
963 errmsg = "unexpected special character";
964 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000965 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000966 }
967 else {
968 *p++ = ch;
969 s++;
970 }
971 continue;
972 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000973 outpos = p-PyUnicode_AS_UNICODE(unicode);
974 endinpos = s-starts;
975 if (unicode_decode_call_errorhandler(
976 errors, &errorHandler,
977 "utf7", errmsg,
978 starts, size, &startinpos, &endinpos, &exc, &s,
979 (PyObject **)&unicode, &outpos, &p))
980 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000981 }
982
983 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000984 outpos = p-PyUnicode_AS_UNICODE(unicode);
985 endinpos = size;
986 if (unicode_decode_call_errorhandler(
987 errors, &errorHandler,
988 "utf7", "unterminated shift sequence",
989 starts, size, &startinpos, &endinpos, &exc, &s,
990 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000992 if (s < e)
993 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000994 }
995
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000996 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 goto onError;
998
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000999 Py_XDECREF(errorHandler);
1000 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001001 return (PyObject *)unicode;
1002
1003onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001004 Py_XDECREF(errorHandler);
1005 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006 Py_DECREF(unicode);
1007 return NULL;
1008}
1009
1010
1011PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1012 int size,
1013 int encodeSetO,
1014 int encodeWhiteSpace,
1015 const char *errors)
1016{
1017 PyObject *v;
1018 /* It might be possible to tighten this worst case */
1019 unsigned int cbAllocated = 5 * size;
1020 int inShift = 0;
1021 int i = 0;
1022 unsigned int bitsleft = 0;
1023 unsigned long charsleft = 0;
1024 char * out;
1025 char * start;
1026
1027 if (size == 0)
1028 return PyString_FromStringAndSize(NULL, 0);
1029
1030 v = PyString_FromStringAndSize(NULL, cbAllocated);
1031 if (v == NULL)
1032 return NULL;
1033
1034 start = out = PyString_AS_STRING(v);
1035 for (;i < size; ++i) {
1036 Py_UNICODE ch = s[i];
1037
1038 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001039 if (ch == '+') {
1040 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001041 *out++ = '-';
1042 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1043 charsleft = ch;
1044 bitsleft = 16;
1045 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001046 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001047 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001048 } else {
1049 *out++ = (char) ch;
1050 }
1051 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1053 *out++ = B64(charsleft << (6-bitsleft));
1054 charsleft = 0;
1055 bitsleft = 0;
1056 /* Characters not in the BASE64 set implicitly unshift the sequence
1057 so no '-' is required, except if the character is itself a '-' */
1058 if (B64CHAR(ch) || ch == '-') {
1059 *out++ = '-';
1060 }
1061 inShift = 0;
1062 *out++ = (char) ch;
1063 } else {
1064 bitsleft += 16;
1065 charsleft = (charsleft << 16) | ch;
1066 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1067
1068 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001069 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001070 or '-' then the shift sequence will be terminated implicitly and we
1071 don't have to insert a '-'. */
1072
1073 if (bitsleft == 0) {
1074 if (i + 1 < size) {
1075 Py_UNICODE ch2 = s[i+1];
1076
1077 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001078
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001079 } else if (B64CHAR(ch2) || ch2 == '-') {
1080 *out++ = '-';
1081 inShift = 0;
1082 } else {
1083 inShift = 0;
1084 }
1085
1086 }
1087 else {
1088 *out++ = '-';
1089 inShift = 0;
1090 }
1091 }
Tim Petersced69f82003-09-16 20:30:58 +00001092 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001094 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001095 if (bitsleft) {
1096 *out++= B64(charsleft << (6-bitsleft) );
1097 *out++ = '-';
1098 }
1099
Tim Peters5de98422002-04-27 18:44:32 +00001100 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001101 return v;
1102}
1103
1104#undef SPECIAL
1105#undef B64
1106#undef B64CHAR
1107#undef UB64
1108#undef ENCODE
1109#undef DECODE
1110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111/* --- UTF-8 Codec -------------------------------------------------------- */
1112
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details.
   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four-, five- and six-byte sequences; 0xfe and 0xff
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1134
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135PyObject *PyUnicode_DecodeUTF8(const char *s,
1136 int size,
1137 const char *errors)
1138{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001139 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001141 int startinpos;
1142 int endinpos;
1143 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 const char *e;
1145 PyUnicodeObject *unicode;
1146 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001147 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001148 PyObject *errorHandler = NULL;
1149 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150
1151 /* Note: size will always be longer than the resulting Unicode
1152 character count */
1153 unicode = _PyUnicode_New(size);
1154 if (!unicode)
1155 return NULL;
1156 if (size == 0)
1157 return (PyObject *)unicode;
1158
1159 /* Unpack UTF-8 encoded data */
1160 p = unicode->str;
1161 e = s + size;
1162
1163 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001164 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165
1166 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001167 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 s++;
1169 continue;
1170 }
1171
1172 n = utf8_code_length[ch];
1173
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001174 if (s + n > e) {
1175 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001176 startinpos = s-starts;
1177 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 goto utf8Error;
1179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180
1181 switch (n) {
1182
1183 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001184 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001185 startinpos = s-starts;
1186 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001187 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188
1189 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001190 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001191 startinpos = s-starts;
1192 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001193 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194
1195 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001196 if ((s[1] & 0xc0) != 0x80) {
1197 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001198 startinpos = s-starts;
1199 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001200 goto utf8Error;
1201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001203 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001204 startinpos = s-starts;
1205 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001206 errmsg = "illegal encoding";
1207 goto utf8Error;
1208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001210 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 break;
1212
1213 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001214 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001215 (s[2] & 0xc0) != 0x80) {
1216 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001217 startinpos = s-starts;
1218 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001219 goto utf8Error;
1220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001222 if (ch < 0x0800) {
1223 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001224 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001225
1226 XXX For wide builds (UCS-4) we should probably try
1227 to recombine the surrogates into a single code
1228 unit.
1229 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001230 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001231 startinpos = s-starts;
1232 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001233 goto utf8Error;
1234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001236 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001237 break;
1238
1239 case 4:
1240 if ((s[1] & 0xc0) != 0x80 ||
1241 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001242 (s[3] & 0xc0) != 0x80) {
1243 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001244 startinpos = s-starts;
1245 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001246 goto utf8Error;
1247 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001248 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1249 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1250 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001251 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001252 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001253 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001254 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001255 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001256 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001257 startinpos = s-starts;
1258 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001259 goto utf8Error;
1260 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001261#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001262 *p++ = (Py_UNICODE)ch;
1263#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001264 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001265
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001266 /* translate from 10000..10FFFF to 0..FFFF */
1267 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001268
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001269 /* high surrogate = top 10 bits added to D800 */
1270 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001271
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001272 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001273 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001274#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275 break;
1276
1277 default:
1278 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001279 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 }
1284 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001286
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001288 outpos = p-PyUnicode_AS_UNICODE(unicode);
1289 if (unicode_decode_call_errorhandler(
1290 errors, &errorHandler,
1291 "utf8", errmsg,
1292 starts, size, &startinpos, &endinpos, &exc, &s,
1293 (PyObject **)&unicode, &outpos, &p))
1294 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295 }
1296
1297 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001298 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299 goto onError;
1300
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001301 Py_XDECREF(errorHandler);
1302 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 return (PyObject *)unicode;
1304
1305onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001306 Py_XDECREF(errorHandler);
1307 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 Py_DECREF(unicode);
1309 return NULL;
1310}
1311
Tim Peters602f7402002-04-27 18:03:26 +00001312/* Allocation strategy: if the string is short, convert into a stack buffer
1313 and allocate exactly as much space needed at the end. Else allocate the
1314 maximum possible needed (4 result bytes per Unicode character), and return
1315 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001316*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001317PyObject *
1318PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1319 int size,
1320 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321{
Tim Peters602f7402002-04-27 18:03:26 +00001322#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001323
Tim Peters602f7402002-04-27 18:03:26 +00001324 int i; /* index into s of next input byte */
1325 PyObject *v; /* result string object */
1326 char *p; /* next free byte in output buffer */
1327 int nallocated; /* number of result bytes allocated */
1328 int nneeded; /* number of result bytes needed */
1329 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001330
Tim Peters602f7402002-04-27 18:03:26 +00001331 assert(s != NULL);
1332 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333
Tim Peters602f7402002-04-27 18:03:26 +00001334 if (size <= MAX_SHORT_UNICHARS) {
1335 /* Write into the stack buffer; nallocated can't overflow.
1336 * At the end, we'll allocate exactly as much heap space as it
1337 * turns out we need.
1338 */
1339 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1340 v = NULL; /* will allocate after we're done */
1341 p = stackbuf;
1342 }
1343 else {
1344 /* Overallocate on the heap, and give the excess back at the end. */
1345 nallocated = size * 4;
1346 if (nallocated / 4 != size) /* overflow! */
1347 return PyErr_NoMemory();
1348 v = PyString_FromStringAndSize(NULL, nallocated);
1349 if (v == NULL)
1350 return NULL;
1351 p = PyString_AS_STRING(v);
1352 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001353
Tim Peters602f7402002-04-27 18:03:26 +00001354 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001355 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001356
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001357 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001358 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001360
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001362 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001363 *p++ = (char)(0xc0 | (ch >> 6));
1364 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001365 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001366 else {
Tim Peters602f7402002-04-27 18:03:26 +00001367 /* Encode UCS2 Unicode ordinals */
1368 if (ch < 0x10000) {
1369 /* Special case: check for high surrogate */
1370 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1371 Py_UCS4 ch2 = s[i];
1372 /* Check for low surrogate and combine the two to
1373 form a UCS4 value */
1374 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001375 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001376 i++;
1377 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001378 }
Tim Peters602f7402002-04-27 18:03:26 +00001379 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001380 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001381 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001382 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1383 *p++ = (char)(0x80 | (ch & 0x3f));
1384 continue;
1385 }
1386encodeUCS4:
1387 /* Encode UCS4 Unicode ordinals */
1388 *p++ = (char)(0xf0 | (ch >> 18));
1389 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1390 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1391 *p++ = (char)(0x80 | (ch & 0x3f));
1392 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001394
Tim Peters602f7402002-04-27 18:03:26 +00001395 if (v == NULL) {
1396 /* This was stack allocated. */
1397 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1398 assert(nneeded <= nallocated);
1399 v = PyString_FromStringAndSize(stackbuf, nneeded);
1400 }
1401 else {
1402 /* Cut back to size actually needed. */
1403 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1404 assert(nneeded <= nallocated);
1405 _PyString_Resize(&v, nneeded);
1406 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001407 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001408
Tim Peters602f7402002-04-27 18:03:26 +00001409#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410}
1411
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1413{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414 if (!PyUnicode_Check(unicode)) {
1415 PyErr_BadArgument();
1416 return NULL;
1417 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001418 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1419 PyUnicode_GET_SIZE(unicode),
1420 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421}
1422
1423/* --- UTF-16 Codec ------------------------------------------------------- */
1424
Tim Peters772747b2001-08-09 22:21:55 +00001425PyObject *
1426PyUnicode_DecodeUTF16(const char *s,
1427 int size,
1428 const char *errors,
1429 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001431 const char *starts = s;
1432 int startinpos;
1433 int endinpos;
1434 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 PyUnicodeObject *unicode;
1436 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001437 const unsigned char *q, *e;
1438 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001439 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001440 /* Offsets from q for retrieving byte pairs in the right order. */
1441#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1442 int ihi = 1, ilo = 0;
1443#else
1444 int ihi = 0, ilo = 1;
1445#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 PyObject *errorHandler = NULL;
1447 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448
1449 /* Note: size will always be longer than the resulting Unicode
1450 character count */
1451 unicode = _PyUnicode_New(size);
1452 if (!unicode)
1453 return NULL;
1454 if (size == 0)
1455 return (PyObject *)unicode;
1456
1457 /* Unpack UTF-16 encoded data */
1458 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001459 q = (unsigned char *)s;
1460 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461
1462 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001463 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001465 /* Check for BOM marks (U+FEFF) in the input and adjust current
1466 byte order setting accordingly. In native mode, the leading BOM
1467 mark is skipped, in all other modes, it is copied to the output
1468 stream as-is (giving a ZWNBSP character). */
1469 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001470 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001471#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001472 if (bom == 0xFEFF) {
1473 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001474 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001475 }
1476 else if (bom == 0xFFFE) {
1477 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001478 bo = 1;
1479 }
Tim Petersced69f82003-09-16 20:30:58 +00001480#else
Tim Peters772747b2001-08-09 22:21:55 +00001481 if (bom == 0xFEFF) {
1482 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001483 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001484 }
1485 else if (bom == 0xFFFE) {
1486 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001487 bo = -1;
1488 }
1489#endif
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491
Tim Peters772747b2001-08-09 22:21:55 +00001492 if (bo == -1) {
1493 /* force LE */
1494 ihi = 1;
1495 ilo = 0;
1496 }
1497 else if (bo == 1) {
1498 /* force BE */
1499 ihi = 0;
1500 ilo = 1;
1501 }
1502
1503 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 Py_UNICODE ch;
1505 /* remaing bytes at the end? (size should be even) */
1506 if (e-q<2) {
1507 errmsg = "truncated data";
1508 startinpos = ((const char *)q)-starts;
1509 endinpos = ((const char *)e)-starts;
1510 goto utf16Error;
1511 /* The remaining input chars are ignored if the callback
1512 chooses to skip the input */
1513 }
1514 ch = (q[ihi] << 8) | q[ilo];
1515
Tim Peters772747b2001-08-09 22:21:55 +00001516 q += 2;
1517
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518 if (ch < 0xD800 || ch > 0xDFFF) {
1519 *p++ = ch;
1520 continue;
1521 }
1522
1523 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001524 if (q >= e) {
1525 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001526 startinpos = (((const char *)q)-2)-starts;
1527 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001528 goto utf16Error;
1529 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001530 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001531 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1532 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001533 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001534#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001535 *p++ = ch;
1536 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001537#else
1538 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001539#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001540 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001541 }
1542 else {
1543 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001544 startinpos = (((const char *)q)-4)-starts;
1545 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001546 goto utf16Error;
1547 }
1548
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001550 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001551 startinpos = (((const char *)q)-2)-starts;
1552 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001553 /* Fall through to report the error */
1554
1555 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001556 outpos = p-PyUnicode_AS_UNICODE(unicode);
1557 if (unicode_decode_call_errorhandler(
1558 errors, &errorHandler,
1559 "utf16", errmsg,
1560 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1561 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001562 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 }
1564
1565 if (byteorder)
1566 *byteorder = bo;
1567
1568 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001569 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570 goto onError;
1571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001572 Py_XDECREF(errorHandler);
1573 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001574 return (PyObject *)unicode;
1575
1576onError:
1577 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001578 Py_XDECREF(errorHandler);
1579 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580 return NULL;
1581}
1582
Tim Peters772747b2001-08-09 22:21:55 +00001583PyObject *
1584PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1585 int size,
1586 const char *errors,
1587 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588{
1589 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001590 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001591#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001592 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001593#else
1594 const int pairs = 0;
1595#endif
Tim Peters772747b2001-08-09 22:21:55 +00001596 /* Offsets from p for storing byte pairs in the right order. */
1597#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1598 int ihi = 1, ilo = 0;
1599#else
1600 int ihi = 0, ilo = 1;
1601#endif
1602
1603#define STORECHAR(CH) \
1604 do { \
1605 p[ihi] = ((CH) >> 8) & 0xff; \
1606 p[ilo] = (CH) & 0xff; \
1607 p += 2; \
1608 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001610#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001611 for (i = pairs = 0; i < size; i++)
1612 if (s[i] >= 0x10000)
1613 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001614#endif
Tim Petersced69f82003-09-16 20:30:58 +00001615 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001616 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617 if (v == NULL)
1618 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619
Tim Peters772747b2001-08-09 22:21:55 +00001620 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001622 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001623 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001624 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001625
1626 if (byteorder == -1) {
1627 /* force LE */
1628 ihi = 1;
1629 ilo = 0;
1630 }
1631 else if (byteorder == 1) {
1632 /* force BE */
1633 ihi = 0;
1634 ilo = 1;
1635 }
1636
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637 while (size-- > 0) {
1638 Py_UNICODE ch = *s++;
1639 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001640#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001641 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001642 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1643 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001645#endif
Tim Peters772747b2001-08-09 22:21:55 +00001646 STORECHAR(ch);
1647 if (ch2)
1648 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001650 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001651#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652}
1653
1654PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1655{
1656 if (!PyUnicode_Check(unicode)) {
1657 PyErr_BadArgument();
1658 return NULL;
1659 }
1660 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1661 PyUnicode_GET_SIZE(unicode),
1662 NULL,
1663 0);
1664}
1665
1666/* --- Unicode Escape Codec ----------------------------------------------- */
1667
Fredrik Lundh06d12682001-01-24 07:59:11 +00001668static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001669
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1671 int size,
1672 const char *errors)
1673{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001674 const char *starts = s;
1675 int startinpos;
1676 int endinpos;
1677 int outpos;
1678 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001680 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001682 char* message;
1683 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 PyObject *errorHandler = NULL;
1685 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001686
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 /* Escaped strings will always be longer than the resulting
1688 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001689 length after conversion to the true value.
1690 (but if the error callback returns a long replacement string
1691 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 v = _PyUnicode_New(size);
1693 if (v == NULL)
1694 goto onError;
1695 if (size == 0)
1696 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001698 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001700
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701 while (s < end) {
1702 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001703 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001704 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705
1706 /* Non-escape characters are interpreted as Unicode ordinals */
1707 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001708 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001709 continue;
1710 }
1711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001712 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 /* \ - Escapes */
1714 s++;
1715 switch (*s++) {
1716
1717 /* \x escapes */
1718 case '\n': break;
1719 case '\\': *p++ = '\\'; break;
1720 case '\'': *p++ = '\''; break;
1721 case '\"': *p++ = '\"'; break;
1722 case 'b': *p++ = '\b'; break;
1723 case 'f': *p++ = '\014'; break; /* FF */
1724 case 't': *p++ = '\t'; break;
1725 case 'n': *p++ = '\n'; break;
1726 case 'r': *p++ = '\r'; break;
1727 case 'v': *p++ = '\013'; break; /* VT */
1728 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1729
1730 /* \OOO (octal) escapes */
1731 case '0': case '1': case '2': case '3':
1732 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001733 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001735 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001737 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001739 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740 break;
1741
Fredrik Lundhccc74732001-02-18 22:13:49 +00001742 /* hex escapes */
1743 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001744 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001745 digits = 2;
1746 message = "truncated \\xXX escape";
1747 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748
Fredrik Lundhccc74732001-02-18 22:13:49 +00001749 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001751 digits = 4;
1752 message = "truncated \\uXXXX escape";
1753 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
Fredrik Lundhccc74732001-02-18 22:13:49 +00001755 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001756 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001757 digits = 8;
1758 message = "truncated \\UXXXXXXXX escape";
1759 hexescape:
1760 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 outpos = p-PyUnicode_AS_UNICODE(v);
1762 if (s+digits>end) {
1763 endinpos = size;
1764 if (unicode_decode_call_errorhandler(
1765 errors, &errorHandler,
1766 "unicodeescape", "end of string in escape sequence",
1767 starts, size, &startinpos, &endinpos, &exc, &s,
1768 (PyObject **)&v, &outpos, &p))
1769 goto onError;
1770 goto nextByte;
1771 }
1772 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001773 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001774 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 endinpos = (s+i+1)-starts;
1776 if (unicode_decode_call_errorhandler(
1777 errors, &errorHandler,
1778 "unicodeescape", message,
1779 starts, size, &startinpos, &endinpos, &exc, &s,
1780 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001781 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001782 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001783 }
1784 chr = (chr<<4) & ~0xF;
1785 if (c >= '0' && c <= '9')
1786 chr += c - '0';
1787 else if (c >= 'a' && c <= 'f')
1788 chr += 10 + c - 'a';
1789 else
1790 chr += 10 + c - 'A';
1791 }
1792 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001793 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001794 /* _decoding_error will have already written into the
1795 target buffer. */
1796 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001797 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001798 /* when we get here, chr is a 32-bit unicode character */
1799 if (chr <= 0xffff)
1800 /* UCS-2 character */
1801 *p++ = (Py_UNICODE) chr;
1802 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001803 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001804 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001805#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001806 *p++ = chr;
1807#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001808 chr -= 0x10000L;
1809 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001810 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001811#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001812 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 endinpos = s-starts;
1814 outpos = p-PyUnicode_AS_UNICODE(v);
1815 if (unicode_decode_call_errorhandler(
1816 errors, &errorHandler,
1817 "unicodeescape", "illegal Unicode character",
1818 starts, size, &startinpos, &endinpos, &exc, &s,
1819 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001820 goto onError;
1821 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 break;
1823
1824 /* \N{name} */
1825 case 'N':
1826 message = "malformed \\N character escape";
1827 if (ucnhash_CAPI == NULL) {
1828 /* load the unicode data module */
1829 PyObject *m, *v;
1830 m = PyImport_ImportModule("unicodedata");
1831 if (m == NULL)
1832 goto ucnhashError;
1833 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1834 Py_DECREF(m);
1835 if (v == NULL)
1836 goto ucnhashError;
1837 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1838 Py_DECREF(v);
1839 if (ucnhash_CAPI == NULL)
1840 goto ucnhashError;
1841 }
1842 if (*s == '{') {
1843 const char *start = s+1;
1844 /* look for the closing brace */
1845 while (*s != '}' && s < end)
1846 s++;
1847 if (s > start && s < end && *s == '}') {
1848 /* found a name. look it up in the unicode database */
1849 message = "unknown Unicode character name";
1850 s++;
1851 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1852 goto store;
1853 }
1854 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001855 endinpos = s-starts;
1856 outpos = p-PyUnicode_AS_UNICODE(v);
1857 if (unicode_decode_call_errorhandler(
1858 errors, &errorHandler,
1859 "unicodeescape", message,
1860 starts, size, &startinpos, &endinpos, &exc, &s,
1861 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 break;
1864
1865 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001866 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 message = "\\ at end of string";
1868 s--;
1869 endinpos = s-starts;
1870 outpos = p-PyUnicode_AS_UNICODE(v);
1871 if (unicode_decode_call_errorhandler(
1872 errors, &errorHandler,
1873 "unicodeescape", message,
1874 starts, size, &startinpos, &endinpos, &exc, &s,
1875 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001876 goto onError;
1877 }
1878 else {
1879 *p++ = '\\';
1880 *p++ = (unsigned char)s[-1];
1881 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001882 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001884 nextByte:
1885 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001887 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001889 Py_XDECREF(errorHandler);
1890 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001891 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001892
Fredrik Lundhccc74732001-02-18 22:13:49 +00001893ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001894 PyErr_SetString(
1895 PyExc_UnicodeError,
1896 "\\N escapes not supported (can't load unicodedata module)"
1897 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 Py_XDECREF(errorHandler);
1899 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001900 return NULL;
1901
Fredrik Lundhccc74732001-02-18 22:13:49 +00001902onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001904 Py_XDECREF(errorHandler);
1905 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906 return NULL;
1907}
1908
1909/* Return a Unicode-Escape string version of the Unicode object.
1910
1911 If quotes is true, the string is enclosed in u"" or u'' quotes as
1912 appropriate.
1913
1914*/
1915
Barry Warsaw51ac5802000-03-20 16:36:48 +00001916static const Py_UNICODE *findchar(const Py_UNICODE *s,
1917 int size,
1918 Py_UNICODE ch);
1919
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920static
1921PyObject *unicodeescape_string(const Py_UNICODE *s,
1922 int size,
1923 int quotes)
1924{
1925 PyObject *repr;
1926 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001928 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929
1930 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1931 if (repr == NULL)
1932 return NULL;
1933
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001934 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935
1936 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001938 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 !findchar(s, size, '"')) ? '"' : '\'';
1940 }
1941 while (size-- > 0) {
1942 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001943
Guido van Rossumd57fd912000-03-10 22:53:23 +00001944 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001945 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001946 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 *p++ = '\\';
1948 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001949 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001950 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001951
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001952#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001953 /* Map 21-bit characters to '\U00xxxxxx' */
1954 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001955 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001956
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001957 /* Resize the string if necessary */
1958 if (offset + 12 > PyString_GET_SIZE(repr)) {
1959 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001960 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001961 p = PyString_AS_STRING(repr) + offset;
1962 }
1963
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001964 *p++ = '\\';
1965 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001966 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1967 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1968 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1969 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1970 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1971 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1972 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001973 *p++ = hexdigit[ch & 0x0000000F];
1974 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001975 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001976#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001977 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1978 else if (ch >= 0xD800 && ch < 0xDC00) {
1979 Py_UNICODE ch2;
1980 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00001981
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001982 ch2 = *s++;
1983 size--;
1984 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1985 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1986 *p++ = '\\';
1987 *p++ = 'U';
1988 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1989 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1990 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1991 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1992 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1993 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1994 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1995 *p++ = hexdigit[ucs & 0x0000000F];
1996 continue;
1997 }
1998 /* Fall through: isolated surrogates are copied as-is */
1999 s--;
2000 size++;
2001 }
2002
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002004 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 *p++ = '\\';
2006 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002007 *p++ = hexdigit[(ch >> 12) & 0x000F];
2008 *p++ = hexdigit[(ch >> 8) & 0x000F];
2009 *p++ = hexdigit[(ch >> 4) & 0x000F];
2010 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002012
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002013 /* Map special whitespace to '\t', \n', '\r' */
2014 else if (ch == '\t') {
2015 *p++ = '\\';
2016 *p++ = 't';
2017 }
2018 else if (ch == '\n') {
2019 *p++ = '\\';
2020 *p++ = 'n';
2021 }
2022 else if (ch == '\r') {
2023 *p++ = '\\';
2024 *p++ = 'r';
2025 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002026
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002027 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002028 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002030 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002031 *p++ = hexdigit[(ch >> 4) & 0x000F];
2032 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002033 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002034
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 /* Copy everything else as-is */
2036 else
2037 *p++ = (char) ch;
2038 }
2039 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002040 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041
2042 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002043 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 return repr;
2045}
2046
2047PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2048 int size)
2049{
2050 return unicodeescape_string(s, size, 0);
2051}
2052
2053PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2054{
2055 if (!PyUnicode_Check(unicode)) {
2056 PyErr_BadArgument();
2057 return NULL;
2058 }
2059 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2060 PyUnicode_GET_SIZE(unicode));
2061}
2062
2063/* --- Raw Unicode Escape Codec ------------------------------------------- */
2064
2065PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2066 int size,
2067 const char *errors)
2068{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069 const char *starts = s;
2070 int startinpos;
2071 int endinpos;
2072 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002074 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 const char *end;
2076 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002077 PyObject *errorHandler = NULL;
2078 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002079
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 /* Escaped strings will always be longer than the resulting
2081 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 length after conversion to the true value. (But decoding error
2083 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 v = _PyUnicode_New(size);
2085 if (v == NULL)
2086 goto onError;
2087 if (size == 0)
2088 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002089 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090 end = s + size;
2091 while (s < end) {
2092 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002093 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002095 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096
2097 /* Non-escape characters are interpreted as Unicode ordinals */
2098 if (*s != '\\') {
2099 *p++ = (unsigned char)*s++;
2100 continue;
2101 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002102 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103
2104 /* \u-escapes are only interpreted iff the number of leading
2105 backslashes if odd */
2106 bs = s;
2107 for (;s < end;) {
2108 if (*s != '\\')
2109 break;
2110 *p++ = (unsigned char)*s++;
2111 }
2112 if (((s - bs) & 1) == 0 ||
2113 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002114 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 continue;
2116 }
2117 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002118 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002119 s++;
2120
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002121 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002123 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002124 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002126 endinpos = s-starts;
2127 if (unicode_decode_call_errorhandler(
2128 errors, &errorHandler,
2129 "rawunicodeescape", "truncated \\uXXXX",
2130 starts, size, &startinpos, &endinpos, &exc, &s,
2131 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 }
2135 x = (x<<4) & ~0xF;
2136 if (c >= '0' && c <= '9')
2137 x += c - '0';
2138 else if (c >= 'a' && c <= 'f')
2139 x += 10 + c - 'a';
2140 else
2141 x += 10 + c - 'A';
2142 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143#ifndef Py_UNICODE_WIDE
2144 if (x > 0x10000) {
2145 if (unicode_decode_call_errorhandler(
2146 errors, &errorHandler,
2147 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2148 starts, size, &startinpos, &endinpos, &exc, &s,
2149 (PyObject **)&v, &outpos, &p))
2150 goto onError;
2151 }
2152#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002153 *p++ = x;
2154 nextByte:
2155 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002157 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002158 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002159 Py_XDECREF(errorHandler);
2160 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002162
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 onError:
2164 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002165 Py_XDECREF(errorHandler);
2166 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 return NULL;
2168}
2169
2170PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2171 int size)
2172{
2173 PyObject *repr;
2174 char *p;
2175 char *q;
2176
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002177 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002179#ifdef Py_UNICODE_WIDE
2180 repr = PyString_FromStringAndSize(NULL, 10 * size);
2181#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002183#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184 if (repr == NULL)
2185 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002186 if (size == 0)
2187 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188
2189 p = q = PyString_AS_STRING(repr);
2190 while (size-- > 0) {
2191 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002192#ifdef Py_UNICODE_WIDE
2193 /* Map 32-bit characters to '\Uxxxxxxxx' */
2194 if (ch >= 0x10000) {
2195 *p++ = '\\';
2196 *p++ = 'U';
2197 *p++ = hexdigit[(ch >> 28) & 0xf];
2198 *p++ = hexdigit[(ch >> 24) & 0xf];
2199 *p++ = hexdigit[(ch >> 20) & 0xf];
2200 *p++ = hexdigit[(ch >> 16) & 0xf];
2201 *p++ = hexdigit[(ch >> 12) & 0xf];
2202 *p++ = hexdigit[(ch >> 8) & 0xf];
2203 *p++ = hexdigit[(ch >> 4) & 0xf];
2204 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002205 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002206 else
2207#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208 /* Map 16-bit characters to '\uxxxx' */
2209 if (ch >= 256) {
2210 *p++ = '\\';
2211 *p++ = 'u';
2212 *p++ = hexdigit[(ch >> 12) & 0xf];
2213 *p++ = hexdigit[(ch >> 8) & 0xf];
2214 *p++ = hexdigit[(ch >> 4) & 0xf];
2215 *p++ = hexdigit[ch & 15];
2216 }
2217 /* Copy everything else as-is */
2218 else
2219 *p++ = (char) ch;
2220 }
2221 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002222 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 return repr;
2224}
2225
2226PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2227{
2228 if (!PyUnicode_Check(unicode)) {
2229 PyErr_BadArgument();
2230 return NULL;
2231 }
2232 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2233 PyUnicode_GET_SIZE(unicode));
2234}
2235
2236/* --- Latin-1 Codec ------------------------------------------------------ */
2237
2238PyObject *PyUnicode_DecodeLatin1(const char *s,
2239 int size,
2240 const char *errors)
2241{
2242 PyUnicodeObject *v;
2243 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002244
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002246 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002247 Py_UNICODE r = *(unsigned char*)s;
2248 return PyUnicode_FromUnicode(&r, 1);
2249 }
2250
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 v = _PyUnicode_New(size);
2252 if (v == NULL)
2253 goto onError;
2254 if (size == 0)
2255 return (PyObject *)v;
2256 p = PyUnicode_AS_UNICODE(v);
2257 while (size-- > 0)
2258 *p++ = (unsigned char)*s++;
2259 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002260
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 onError:
2262 Py_XDECREF(v);
2263 return NULL;
2264}
2265
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266/* create or adjust a UnicodeEncodeError */
2267static void make_encode_exception(PyObject **exceptionObject,
2268 const char *encoding,
2269 const Py_UNICODE *unicode, int size,
2270 int startpos, int endpos,
2271 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002273 if (*exceptionObject == NULL) {
2274 *exceptionObject = PyUnicodeEncodeError_Create(
2275 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 }
2277 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2279 goto onError;
2280 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2281 goto onError;
2282 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2283 goto onError;
2284 return;
2285 onError:
2286 Py_DECREF(*exceptionObject);
2287 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 }
2289}
2290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002291/* raises a UnicodeEncodeError */
2292static void raise_encode_exception(PyObject **exceptionObject,
2293 const char *encoding,
2294 const Py_UNICODE *unicode, int size,
2295 int startpos, int endpos,
2296 const char *reason)
2297{
2298 make_encode_exception(exceptionObject,
2299 encoding, unicode, size, startpos, endpos, reason);
2300 if (*exceptionObject != NULL)
2301 PyCodec_StrictErrors(*exceptionObject);
2302}
2303
2304/* error handling callback helper:
2305 build arguments, call the callback and check the arguments,
2306 put the result into newpos and return the replacement string, which
2307 has to be freed by the caller */
2308static PyObject *unicode_encode_call_errorhandler(const char *errors,
2309 PyObject **errorHandler,
2310 const char *encoding, const char *reason,
2311 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2312 int startpos, int endpos,
2313 int *newpos)
2314{
2315 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2316
2317 PyObject *restuple;
2318 PyObject *resunicode;
2319
2320 if (*errorHandler == NULL) {
2321 *errorHandler = PyCodec_LookupError(errors);
2322 if (*errorHandler == NULL)
2323 return NULL;
2324 }
2325
2326 make_encode_exception(exceptionObject,
2327 encoding, unicode, size, startpos, endpos, reason);
2328 if (*exceptionObject == NULL)
2329 return NULL;
2330
2331 restuple = PyObject_CallFunctionObjArgs(
2332 *errorHandler, *exceptionObject, NULL);
2333 if (restuple == NULL)
2334 return NULL;
2335 if (!PyTuple_Check(restuple)) {
2336 PyErr_Format(PyExc_TypeError, &argparse[4]);
2337 Py_DECREF(restuple);
2338 return NULL;
2339 }
2340 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2341 &resunicode, newpos)) {
2342 Py_DECREF(restuple);
2343 return NULL;
2344 }
2345 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002346 *newpos = size+*newpos;
2347 if (*newpos<0 || *newpos>size) {
2348 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2349 Py_DECREF(restuple);
2350 return NULL;
2351 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002352 Py_INCREF(resunicode);
2353 Py_DECREF(restuple);
2354 return resunicode;
2355}
2356
2357static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2358 int size,
2359 const char *errors,
2360 int limit)
2361{
2362 /* output object */
2363 PyObject *res;
2364 /* pointers to the beginning and end+1 of input */
2365 const Py_UNICODE *startp = p;
2366 const Py_UNICODE *endp = p + size;
2367 /* pointer to the beginning of the unencodable characters */
2368 /* const Py_UNICODE *badp = NULL; */
2369 /* pointer into the output */
2370 char *str;
2371 /* current output position */
2372 int respos = 0;
2373 int ressize;
2374 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2375 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2376 PyObject *errorHandler = NULL;
2377 PyObject *exc = NULL;
2378 /* the following variable is used for caching string comparisons
2379 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2380 int known_errorHandler = -1;
2381
2382 /* allocate enough for a simple encoding without
2383 replacements, if we need more, we'll resize */
2384 res = PyString_FromStringAndSize(NULL, size);
2385 if (res == NULL)
2386 goto onError;
2387 if (size == 0)
2388 return res;
2389 str = PyString_AS_STRING(res);
2390 ressize = size;
2391
2392 while (p<endp) {
2393 Py_UNICODE c = *p;
2394
2395 /* can we encode this? */
2396 if (c<limit) {
2397 /* no overflow check, because we know that the space is enough */
2398 *str++ = (char)c;
2399 ++p;
2400 }
2401 else {
2402 int unicodepos = p-startp;
2403 int requiredsize;
2404 PyObject *repunicode;
2405 int repsize;
2406 int newpos;
2407 int respos;
2408 Py_UNICODE *uni2;
2409 /* startpos for collecting unencodable chars */
2410 const Py_UNICODE *collstart = p;
2411 const Py_UNICODE *collend = p;
2412 /* find all unecodable characters */
2413 while ((collend < endp) && ((*collend)>=limit))
2414 ++collend;
2415 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2416 if (known_errorHandler==-1) {
2417 if ((errors==NULL) || (!strcmp(errors, "strict")))
2418 known_errorHandler = 1;
2419 else if (!strcmp(errors, "replace"))
2420 known_errorHandler = 2;
2421 else if (!strcmp(errors, "ignore"))
2422 known_errorHandler = 3;
2423 else if (!strcmp(errors, "xmlcharrefreplace"))
2424 known_errorHandler = 4;
2425 else
2426 known_errorHandler = 0;
2427 }
2428 switch (known_errorHandler) {
2429 case 1: /* strict */
2430 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2431 goto onError;
2432 case 2: /* replace */
2433 while (collstart++<collend)
2434 *str++ = '?'; /* fall through */
2435 case 3: /* ignore */
2436 p = collend;
2437 break;
2438 case 4: /* xmlcharrefreplace */
2439 respos = str-PyString_AS_STRING(res);
2440 /* determine replacement size (temporarily (mis)uses p) */
2441 for (p = collstart, repsize = 0; p < collend; ++p) {
2442 if (*p<10)
2443 repsize += 2+1+1;
2444 else if (*p<100)
2445 repsize += 2+2+1;
2446 else if (*p<1000)
2447 repsize += 2+3+1;
2448 else if (*p<10000)
2449 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002450#ifndef Py_UNICODE_WIDE
2451 else
2452 repsize += 2+5+1;
2453#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002454 else if (*p<100000)
2455 repsize += 2+5+1;
2456 else if (*p<1000000)
2457 repsize += 2+6+1;
2458 else
2459 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002460#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002461 }
2462 requiredsize = respos+repsize+(endp-collend);
2463 if (requiredsize > ressize) {
2464 if (requiredsize<2*ressize)
2465 requiredsize = 2*ressize;
2466 if (_PyString_Resize(&res, requiredsize))
2467 goto onError;
2468 str = PyString_AS_STRING(res) + respos;
2469 ressize = requiredsize;
2470 }
2471 /* generate replacement (temporarily (mis)uses p) */
2472 for (p = collstart; p < collend; ++p) {
2473 str += sprintf(str, "&#%d;", (int)*p);
2474 }
2475 p = collend;
2476 break;
2477 default:
2478 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2479 encoding, reason, startp, size, &exc,
2480 collstart-startp, collend-startp, &newpos);
2481 if (repunicode == NULL)
2482 goto onError;
2483 /* need more space? (at least enough for what we
2484 have+the replacement+the rest of the string, so
2485 we won't have to check space for encodable characters) */
2486 respos = str-PyString_AS_STRING(res);
2487 repsize = PyUnicode_GET_SIZE(repunicode);
2488 requiredsize = respos+repsize+(endp-collend);
2489 if (requiredsize > ressize) {
2490 if (requiredsize<2*ressize)
2491 requiredsize = 2*ressize;
2492 if (_PyString_Resize(&res, requiredsize)) {
2493 Py_DECREF(repunicode);
2494 goto onError;
2495 }
2496 str = PyString_AS_STRING(res) + respos;
2497 ressize = requiredsize;
2498 }
2499 /* check if there is anything unencodable in the replacement
2500 and copy it to the output */
2501 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2502 c = *uni2;
2503 if (c >= limit) {
2504 raise_encode_exception(&exc, encoding, startp, size,
2505 unicodepos, unicodepos+1, reason);
2506 Py_DECREF(repunicode);
2507 goto onError;
2508 }
2509 *str = (char)c;
2510 }
2511 p = startp + newpos;
2512 Py_DECREF(repunicode);
2513 }
2514 }
2515 }
2516 /* Resize if we allocated to much */
2517 respos = str-PyString_AS_STRING(res);
2518 if (respos<ressize)
2519 /* If this falls res will be NULL */
2520 _PyString_Resize(&res, respos);
2521 Py_XDECREF(errorHandler);
2522 Py_XDECREF(exc);
2523 return res;
2524
2525 onError:
2526 Py_XDECREF(res);
2527 Py_XDECREF(errorHandler);
2528 Py_XDECREF(exc);
2529 return NULL;
2530}
2531
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2533 int size,
2534 const char *errors)
2535{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537}
2538
2539PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2540{
2541 if (!PyUnicode_Check(unicode)) {
2542 PyErr_BadArgument();
2543 return NULL;
2544 }
2545 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2546 PyUnicode_GET_SIZE(unicode),
2547 NULL);
2548}
2549
2550/* --- 7-bit ASCII Codec -------------------------------------------------- */
2551
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552PyObject *PyUnicode_DecodeASCII(const char *s,
2553 int size,
2554 const char *errors)
2555{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002556 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 PyUnicodeObject *v;
2558 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002559 int startinpos;
2560 int endinpos;
2561 int outpos;
2562 const char *e;
2563 PyObject *errorHandler = NULL;
2564 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002565
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002567 if (size == 1 && *(unsigned char*)s < 128) {
2568 Py_UNICODE r = *(unsigned char*)s;
2569 return PyUnicode_FromUnicode(&r, 1);
2570 }
Tim Petersced69f82003-09-16 20:30:58 +00002571
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 v = _PyUnicode_New(size);
2573 if (v == NULL)
2574 goto onError;
2575 if (size == 0)
2576 return (PyObject *)v;
2577 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002578 e = s + size;
2579 while (s < e) {
2580 register unsigned char c = (unsigned char)*s;
2581 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 ++s;
2584 }
2585 else {
2586 startinpos = s-starts;
2587 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002588 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002589 if (unicode_decode_call_errorhandler(
2590 errors, &errorHandler,
2591 "ascii", "ordinal not in range(128)",
2592 starts, size, &startinpos, &endinpos, &exc, &s,
2593 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002597 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002598 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002599 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002600 Py_XDECREF(errorHandler);
2601 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002603
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 onError:
2605 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002606 Py_XDECREF(errorHandler);
2607 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 return NULL;
2609}
2610
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2612 int size,
2613 const char *errors)
2614{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002615 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616}
2617
2618PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2619{
2620 if (!PyUnicode_Check(unicode)) {
2621 PyErr_BadArgument();
2622 return NULL;
2623 }
2624 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2625 PyUnicode_GET_SIZE(unicode),
2626 NULL);
2627}
2628
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002629#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002630
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002631/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002632
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002633PyObject *PyUnicode_DecodeMBCS(const char *s,
2634 int size,
2635 const char *errors)
2636{
2637 PyUnicodeObject *v;
2638 Py_UNICODE *p;
2639
2640 /* First get the size of the result */
2641 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002642 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002643 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2644
2645 v = _PyUnicode_New(usize);
2646 if (v == NULL)
2647 return NULL;
2648 if (usize == 0)
2649 return (PyObject *)v;
2650 p = PyUnicode_AS_UNICODE(v);
2651 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2652 Py_DECREF(v);
2653 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2654 }
2655
2656 return (PyObject *)v;
2657}
2658
2659PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2660 int size,
2661 const char *errors)
2662{
2663 PyObject *repr;
2664 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002665 DWORD mbcssize;
2666
2667 /* If there are no characters, bail now! */
2668 if (size==0)
2669 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002670
2671 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002672 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002673 if (mbcssize==0)
2674 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2675
2676 repr = PyString_FromStringAndSize(NULL, mbcssize);
2677 if (repr == NULL)
2678 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002679 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002680 return repr;
2681
2682 /* Do the conversion */
2683 s = PyString_AS_STRING(repr);
2684 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2685 Py_DECREF(repr);
2686 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2687 }
2688 return repr;
2689}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002690
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002691PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2692{
2693 if (!PyUnicode_Check(unicode)) {
2694 PyErr_BadArgument();
2695 return NULL;
2696 }
2697 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2698 PyUnicode_GET_SIZE(unicode),
2699 NULL);
2700}
2701
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002702#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002703
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704/* --- Character Mapping Codec -------------------------------------------- */
2705
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706PyObject *PyUnicode_DecodeCharmap(const char *s,
2707 int size,
2708 PyObject *mapping,
2709 const char *errors)
2710{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 const char *starts = s;
2712 int startinpos;
2713 int endinpos;
2714 int outpos;
2715 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 PyUnicodeObject *v;
2717 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002718 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002719 PyObject *errorHandler = NULL;
2720 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002721
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 /* Default to Latin-1 */
2723 if (mapping == NULL)
2724 return PyUnicode_DecodeLatin1(s, size, errors);
2725
2726 v = _PyUnicode_New(size);
2727 if (v == NULL)
2728 goto onError;
2729 if (size == 0)
2730 return (PyObject *)v;
2731 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 e = s + size;
2733 while (s < e) {
2734 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 PyObject *w, *x;
2736
2737 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2738 w = PyInt_FromLong((long)ch);
2739 if (w == NULL)
2740 goto onError;
2741 x = PyObject_GetItem(mapping, w);
2742 Py_DECREF(w);
2743 if (x == NULL) {
2744 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002745 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002747 x = Py_None;
2748 Py_INCREF(x);
2749 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002750 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 }
2752
2753 /* Apply mapping */
2754 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002755 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 if (value < 0 || value > 65535) {
2757 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002758 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 Py_DECREF(x);
2760 goto onError;
2761 }
2762 *p++ = (Py_UNICODE)value;
2763 }
2764 else if (x == Py_None) {
2765 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 outpos = p-PyUnicode_AS_UNICODE(v);
2767 startinpos = s-starts;
2768 endinpos = startinpos+1;
2769 if (unicode_decode_call_errorhandler(
2770 errors, &errorHandler,
2771 "charmap", "character maps to <undefined>",
2772 starts, size, &startinpos, &endinpos, &exc, &s,
2773 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 Py_DECREF(x);
2775 goto onError;
2776 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002777 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 }
2779 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002780 int targetsize = PyUnicode_GET_SIZE(x);
2781
2782 if (targetsize == 1)
2783 /* 1-1 mapping */
2784 *p++ = *PyUnicode_AS_UNICODE(x);
2785
2786 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002788 if (targetsize > extrachars) {
2789 /* resize first */
2790 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2791 int needed = (targetsize - extrachars) + \
2792 (targetsize << 2);
2793 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002794 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002795 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002796 Py_DECREF(x);
2797 goto onError;
2798 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002799 p = PyUnicode_AS_UNICODE(v) + oldpos;
2800 }
2801 Py_UNICODE_COPY(p,
2802 PyUnicode_AS_UNICODE(x),
2803 targetsize);
2804 p += targetsize;
2805 extrachars -= targetsize;
2806 }
2807 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 }
2809 else {
2810 /* wrong return value */
2811 PyErr_SetString(PyExc_TypeError,
2812 "character mapping must return integer, None or unicode");
2813 Py_DECREF(x);
2814 goto onError;
2815 }
2816 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 }
2819 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002820 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 Py_XDECREF(errorHandler);
2823 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002825
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 Py_XDECREF(errorHandler);
2828 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829 Py_XDECREF(v);
2830 return NULL;
2831}
2832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833/* Lookup the character ch in the mapping. If the character
2834 can't be found, Py_None is returned (or NULL, if another
2835 error occured). */
2836static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 PyObject *w = PyInt_FromLong((long)c);
2839 PyObject *x;
2840
2841 if (w == NULL)
2842 return NULL;
2843 x = PyObject_GetItem(mapping, w);
2844 Py_DECREF(w);
2845 if (x == NULL) {
2846 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2847 /* No mapping found means: mapping is undefined. */
2848 PyErr_Clear();
2849 x = Py_None;
2850 Py_INCREF(x);
2851 return x;
2852 } else
2853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002855 else if (x == Py_None)
2856 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 else if (PyInt_Check(x)) {
2858 long value = PyInt_AS_LONG(x);
2859 if (value < 0 || value > 255) {
2860 PyErr_SetString(PyExc_TypeError,
2861 "character mapping must be in range(256)");
2862 Py_DECREF(x);
2863 return NULL;
2864 }
2865 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 else if (PyString_Check(x))
2868 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002870 /* wrong return value */
2871 PyErr_SetString(PyExc_TypeError,
2872 "character mapping must return integer, None or str");
2873 Py_DECREF(x);
2874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 }
2876}
2877
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002878/* lookup the character, put the result in the output string and adjust
2879 various state variables. Reallocate the output string if not enough
2880 space is available. Return a new reference to the object that
2881 was put in the output buffer, or Py_None, if the mapping was undefined
2882 (in which case no character was written) or NULL, if a
2883 reallocation error ocurred. The called must decref the result */
2884static
2885PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2886 PyObject **outobj, int *outpos)
2887{
2888 PyObject *rep = charmapencode_lookup(c, mapping);
2889
2890 if (rep==NULL)
2891 return NULL;
2892 else if (rep==Py_None)
2893 return rep;
2894 else {
2895 char *outstart = PyString_AS_STRING(*outobj);
2896 int outsize = PyString_GET_SIZE(*outobj);
2897 if (PyInt_Check(rep)) {
2898 int requiredsize = *outpos+1;
2899 if (outsize<requiredsize) {
2900 /* exponentially overallocate to minimize reallocations */
2901 if (requiredsize < 2*outsize)
2902 requiredsize = 2*outsize;
2903 if (_PyString_Resize(outobj, requiredsize)) {
2904 Py_DECREF(rep);
2905 return NULL;
2906 }
2907 outstart = PyString_AS_STRING(*outobj);
2908 }
2909 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2910 }
2911 else {
2912 const char *repchars = PyString_AS_STRING(rep);
2913 int repsize = PyString_GET_SIZE(rep);
2914 int requiredsize = *outpos+repsize;
2915 if (outsize<requiredsize) {
2916 /* exponentially overallocate to minimize reallocations */
2917 if (requiredsize < 2*outsize)
2918 requiredsize = 2*outsize;
2919 if (_PyString_Resize(outobj, requiredsize)) {
2920 Py_DECREF(rep);
2921 return NULL;
2922 }
2923 outstart = PyString_AS_STRING(*outobj);
2924 }
2925 memcpy(outstart + *outpos, repchars, repsize);
2926 *outpos += repsize;
2927 }
2928 }
2929 return rep;
2930}
2931
2932/* handle an error in PyUnicode_EncodeCharmap
2933 Return 0 on success, -1 on error */
2934static
2935int charmap_encoding_error(
2936 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2937 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002938 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002939 PyObject **res, int *respos)
2940{
2941 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2942 int repsize;
2943 int newpos;
2944 Py_UNICODE *uni2;
2945 /* startpos for collecting unencodable chars */
2946 int collstartpos = *inpos;
2947 int collendpos = *inpos+1;
2948 int collpos;
2949 char *encoding = "charmap";
2950 char *reason = "character maps to <undefined>";
2951
2952 PyObject *x;
2953 /* find all unencodable characters */
2954 while (collendpos < size) {
2955 x = charmapencode_lookup(p[collendpos], mapping);
2956 if (x==NULL)
2957 return -1;
2958 else if (x!=Py_None) {
2959 Py_DECREF(x);
2960 break;
2961 }
2962 Py_DECREF(x);
2963 ++collendpos;
2964 }
2965 /* cache callback name lookup
2966 * (if not done yet, i.e. it's the first error) */
2967 if (*known_errorHandler==-1) {
2968 if ((errors==NULL) || (!strcmp(errors, "strict")))
2969 *known_errorHandler = 1;
2970 else if (!strcmp(errors, "replace"))
2971 *known_errorHandler = 2;
2972 else if (!strcmp(errors, "ignore"))
2973 *known_errorHandler = 3;
2974 else if (!strcmp(errors, "xmlcharrefreplace"))
2975 *known_errorHandler = 4;
2976 else
2977 *known_errorHandler = 0;
2978 }
2979 switch (*known_errorHandler) {
2980 case 1: /* strict */
2981 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2982 return -1;
2983 case 2: /* replace */
2984 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2985 x = charmapencode_output('?', mapping, res, respos);
2986 if (x==NULL) {
2987 return -1;
2988 }
2989 else if (x==Py_None) {
2990 Py_DECREF(x);
2991 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2992 return -1;
2993 }
2994 Py_DECREF(x);
2995 }
2996 /* fall through */
2997 case 3: /* ignore */
2998 *inpos = collendpos;
2999 break;
3000 case 4: /* xmlcharrefreplace */
3001 /* generate replacement (temporarily (mis)uses p) */
3002 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3003 char buffer[2+29+1+1];
3004 char *cp;
3005 sprintf(buffer, "&#%d;", (int)p[collpos]);
3006 for (cp = buffer; *cp; ++cp) {
3007 x = charmapencode_output(*cp, mapping, res, respos);
3008 if (x==NULL)
3009 return -1;
3010 else if (x==Py_None) {
3011 Py_DECREF(x);
3012 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3013 return -1;
3014 }
3015 Py_DECREF(x);
3016 }
3017 }
3018 *inpos = collendpos;
3019 break;
3020 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003021 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022 encoding, reason, p, size, exceptionObject,
3023 collstartpos, collendpos, &newpos);
3024 if (repunicode == NULL)
3025 return -1;
3026 /* generate replacement */
3027 repsize = PyUnicode_GET_SIZE(repunicode);
3028 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3029 x = charmapencode_output(*uni2, mapping, res, respos);
3030 if (x==NULL) {
3031 Py_DECREF(repunicode);
3032 return -1;
3033 }
3034 else if (x==Py_None) {
3035 Py_DECREF(repunicode);
3036 Py_DECREF(x);
3037 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3038 return -1;
3039 }
3040 Py_DECREF(x);
3041 }
3042 *inpos = newpos;
3043 Py_DECREF(repunicode);
3044 }
3045 return 0;
3046}
3047
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3049 int size,
3050 PyObject *mapping,
3051 const char *errors)
3052{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 /* output object */
3054 PyObject *res = NULL;
3055 /* current input position */
3056 int inpos = 0;
3057 /* current output position */
3058 int respos = 0;
3059 PyObject *errorHandler = NULL;
3060 PyObject *exc = NULL;
3061 /* the following variable is used for caching string comparisons
3062 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3063 * 3=ignore, 4=xmlcharrefreplace */
3064 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065
3066 /* Default to Latin-1 */
3067 if (mapping == NULL)
3068 return PyUnicode_EncodeLatin1(p, size, errors);
3069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 /* allocate enough for a simple encoding without
3071 replacements, if we need more, we'll resize */
3072 res = PyString_FromStringAndSize(NULL, size);
3073 if (res == NULL)
3074 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003075 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 while (inpos<size) {
3079 /* try to encode it */
3080 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3081 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003083 if (x==Py_None) { /* unencodable character */
3084 if (charmap_encoding_error(p, size, &inpos, mapping,
3085 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003086 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003087 &res, &respos)) {
3088 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003089 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003092 else
3093 /* done with this character => adjust input position */
3094 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095 Py_DECREF(x);
3096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003098 /* Resize if we allocated to much */
3099 if (respos<PyString_GET_SIZE(res)) {
3100 if (_PyString_Resize(&res, respos))
3101 goto onError;
3102 }
3103 Py_XDECREF(exc);
3104 Py_XDECREF(errorHandler);
3105 return res;
3106
3107 onError:
3108 Py_XDECREF(res);
3109 Py_XDECREF(exc);
3110 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 return NULL;
3112}
3113
3114PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3115 PyObject *mapping)
3116{
3117 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3118 PyErr_BadArgument();
3119 return NULL;
3120 }
3121 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3122 PyUnicode_GET_SIZE(unicode),
3123 mapping,
3124 NULL);
3125}
3126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003127/* create or adjust a UnicodeTranslateError */
3128static void make_translate_exception(PyObject **exceptionObject,
3129 const Py_UNICODE *unicode, int size,
3130 int startpos, int endpos,
3131 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003133 if (*exceptionObject == NULL) {
3134 *exceptionObject = PyUnicodeTranslateError_Create(
3135 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 }
3137 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3139 goto onError;
3140 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3141 goto onError;
3142 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3143 goto onError;
3144 return;
3145 onError:
3146 Py_DECREF(*exceptionObject);
3147 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148 }
3149}
3150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151/* raises a UnicodeTranslateError */
3152static void raise_translate_exception(PyObject **exceptionObject,
3153 const Py_UNICODE *unicode, int size,
3154 int startpos, int endpos,
3155 const char *reason)
3156{
3157 make_translate_exception(exceptionObject,
3158 unicode, size, startpos, endpos, reason);
3159 if (*exceptionObject != NULL)
3160 PyCodec_StrictErrors(*exceptionObject);
3161}
3162
3163/* error handling callback helper:
3164 build arguments, call the callback and check the arguments,
3165 put the result into newpos and return the replacement string, which
3166 has to be freed by the caller */
3167static PyObject *unicode_translate_call_errorhandler(const char *errors,
3168 PyObject **errorHandler,
3169 const char *reason,
3170 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3171 int startpos, int endpos,
3172 int *newpos)
3173{
3174 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3175
3176 PyObject *restuple;
3177 PyObject *resunicode;
3178
3179 if (*errorHandler == NULL) {
3180 *errorHandler = PyCodec_LookupError(errors);
3181 if (*errorHandler == NULL)
3182 return NULL;
3183 }
3184
3185 make_translate_exception(exceptionObject,
3186 unicode, size, startpos, endpos, reason);
3187 if (*exceptionObject == NULL)
3188 return NULL;
3189
3190 restuple = PyObject_CallFunctionObjArgs(
3191 *errorHandler, *exceptionObject, NULL);
3192 if (restuple == NULL)
3193 return NULL;
3194 if (!PyTuple_Check(restuple)) {
3195 PyErr_Format(PyExc_TypeError, &argparse[4]);
3196 Py_DECREF(restuple);
3197 return NULL;
3198 }
3199 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3200 &resunicode, newpos)) {
3201 Py_DECREF(restuple);
3202 return NULL;
3203 }
3204 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003205 *newpos = size+*newpos;
3206 if (*newpos<0 || *newpos>size) {
3207 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3208 Py_DECREF(restuple);
3209 return NULL;
3210 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 Py_INCREF(resunicode);
3212 Py_DECREF(restuple);
3213 return resunicode;
3214}
3215
3216/* Lookup the character ch in the mapping and put the result in result,
3217 which must be decrefed by the caller.
3218 Return 0 on success, -1 on error */
3219static
3220int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3221{
3222 PyObject *w = PyInt_FromLong((long)c);
3223 PyObject *x;
3224
3225 if (w == NULL)
3226 return -1;
3227 x = PyObject_GetItem(mapping, w);
3228 Py_DECREF(w);
3229 if (x == NULL) {
3230 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3231 /* No mapping found means: use 1:1 mapping. */
3232 PyErr_Clear();
3233 *result = NULL;
3234 return 0;
3235 } else
3236 return -1;
3237 }
3238 else if (x == Py_None) {
3239 *result = x;
3240 return 0;
3241 }
3242 else if (PyInt_Check(x)) {
3243 long value = PyInt_AS_LONG(x);
3244 long max = PyUnicode_GetMax();
3245 if (value < 0 || value > max) {
3246 PyErr_Format(PyExc_TypeError,
3247 "character mapping must be in range(0x%lx)", max+1);
3248 Py_DECREF(x);
3249 return -1;
3250 }
3251 *result = x;
3252 return 0;
3253 }
3254 else if (PyUnicode_Check(x)) {
3255 *result = x;
3256 return 0;
3257 }
3258 else {
3259 /* wrong return value */
3260 PyErr_SetString(PyExc_TypeError,
3261 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003262 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 return -1;
3264 }
3265}
3266/* ensure that *outobj is at least requiredsize characters long,
3267if not reallocate and adjust various state variables.
3268Return 0 on success, -1 on error */
3269static
Walter Dörwald4894c302003-10-24 14:25:28 +00003270int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 int requiredsize)
3272{
Walter Dörwald4894c302003-10-24 14:25:28 +00003273 int oldsize = PyUnicode_GET_SIZE(*outobj);
3274 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275 /* remember old output position */
3276 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3277 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003278 if (requiredsize < 2 * oldsize)
3279 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003280 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 return -1;
3282 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 }
3284 return 0;
3285}
3286/* lookup the character, put the result in the output string and adjust
3287 various state variables. Return a new reference to the object that
3288 was put in the output buffer in *result, or Py_None, if the mapping was
3289 undefined (in which case no character was written).
3290 The called must decref result.
3291 Return 0 on success, -1 on error. */
3292static
Walter Dörwald4894c302003-10-24 14:25:28 +00003293int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3294 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3295 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296{
Walter Dörwald4894c302003-10-24 14:25:28 +00003297 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 return -1;
3299 if (*res==NULL) {
3300 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003301 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 }
3303 else if (*res==Py_None)
3304 ;
3305 else if (PyInt_Check(*res)) {
3306 /* no overflow check, because we know that the space is enough */
3307 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3308 }
3309 else if (PyUnicode_Check(*res)) {
3310 int repsize = PyUnicode_GET_SIZE(*res);
3311 if (repsize==1) {
3312 /* no overflow check, because we know that the space is enough */
3313 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3314 }
3315 else if (repsize!=0) {
3316 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003317 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003318 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003319 repsize - 1;
3320 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321 return -1;
3322 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3323 *outp += repsize;
3324 }
3325 }
3326 else
3327 return -1;
3328 return 0;
3329}
3330
3331PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 int size,
3333 PyObject *mapping,
3334 const char *errors)
3335{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336 /* output object */
3337 PyObject *res = NULL;
3338 /* pointers to the beginning and end+1 of input */
3339 const Py_UNICODE *startp = p;
3340 const Py_UNICODE *endp = p + size;
3341 /* pointer into the output */
3342 Py_UNICODE *str;
3343 /* current output position */
3344 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345 char *reason = "character maps to <undefined>";
3346 PyObject *errorHandler = NULL;
3347 PyObject *exc = NULL;
3348 /* the following variable is used for caching string comparisons
3349 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3350 * 3=ignore, 4=xmlcharrefreplace */
3351 int known_errorHandler = -1;
3352
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 if (mapping == NULL) {
3354 PyErr_BadArgument();
3355 return NULL;
3356 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003357
3358 /* allocate enough for a simple 1:1 translation without
3359 replacements, if we need more, we'll resize */
3360 res = PyUnicode_FromUnicode(NULL, size);
3361 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003362 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003364 return res;
3365 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003367 while (p<endp) {
3368 /* try to encode it */
3369 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003370 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 goto onError;
3373 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003374 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003375 if (x!=Py_None) /* it worked => adjust input pointer */
3376 ++p;
3377 else { /* untranslatable character */
3378 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3379 int repsize;
3380 int newpos;
3381 Py_UNICODE *uni2;
3382 /* startpos for collecting untranslatable chars */
3383 const Py_UNICODE *collstart = p;
3384 const Py_UNICODE *collend = p+1;
3385 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003387 /* find all untranslatable characters */
3388 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003389 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 goto onError;
3391 Py_XDECREF(x);
3392 if (x!=Py_None)
3393 break;
3394 ++collend;
3395 }
3396 /* cache callback name lookup
3397 * (if not done yet, i.e. it's the first error) */
3398 if (known_errorHandler==-1) {
3399 if ((errors==NULL) || (!strcmp(errors, "strict")))
3400 known_errorHandler = 1;
3401 else if (!strcmp(errors, "replace"))
3402 known_errorHandler = 2;
3403 else if (!strcmp(errors, "ignore"))
3404 known_errorHandler = 3;
3405 else if (!strcmp(errors, "xmlcharrefreplace"))
3406 known_errorHandler = 4;
3407 else
3408 known_errorHandler = 0;
3409 }
3410 switch (known_errorHandler) {
3411 case 1: /* strict */
3412 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3413 goto onError;
3414 case 2: /* replace */
3415 /* No need to check for space, this is a 1:1 replacement */
3416 for (coll = collstart; coll<collend; ++coll)
3417 *str++ = '?';
3418 /* fall through */
3419 case 3: /* ignore */
3420 p = collend;
3421 break;
3422 case 4: /* xmlcharrefreplace */
3423 /* generate replacement (temporarily (mis)uses p) */
3424 for (p = collstart; p < collend; ++p) {
3425 char buffer[2+29+1+1];
3426 char *cp;
3427 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003428 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3430 goto onError;
3431 for (cp = buffer; *cp; ++cp)
3432 *str++ = *cp;
3433 }
3434 p = collend;
3435 break;
3436 default:
3437 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3438 reason, startp, size, &exc,
3439 collstart-startp, collend-startp, &newpos);
3440 if (repunicode == NULL)
3441 goto onError;
3442 /* generate replacement */
3443 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003444 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3446 Py_DECREF(repunicode);
3447 goto onError;
3448 }
3449 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3450 *str++ = *uni2;
3451 p = startp + newpos;
3452 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 }
3454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456 /* Resize if we allocated to much */
3457 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003458 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003459 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003460 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461 }
3462 Py_XDECREF(exc);
3463 Py_XDECREF(errorHandler);
3464 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 onError:
3467 Py_XDECREF(res);
3468 Py_XDECREF(exc);
3469 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 return NULL;
3471}
3472
3473PyObject *PyUnicode_Translate(PyObject *str,
3474 PyObject *mapping,
3475 const char *errors)
3476{
3477 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003478
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479 str = PyUnicode_FromObject(str);
3480 if (str == NULL)
3481 goto onError;
3482 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3483 PyUnicode_GET_SIZE(str),
3484 mapping,
3485 errors);
3486 Py_DECREF(str);
3487 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003488
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 onError:
3490 Py_XDECREF(str);
3491 return NULL;
3492}
Tim Petersced69f82003-09-16 20:30:58 +00003493
Guido van Rossum9e896b32000-04-05 20:11:21 +00003494/* --- Decimal Encoder ---------------------------------------------------- */
3495
3496int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3497 int length,
3498 char *output,
3499 const char *errors)
3500{
3501 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 PyObject *errorHandler = NULL;
3503 PyObject *exc = NULL;
3504 const char *encoding = "decimal";
3505 const char *reason = "invalid decimal Unicode string";
3506 /* the following variable is used for caching string comparisons
3507 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3508 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003509
3510 if (output == NULL) {
3511 PyErr_BadArgument();
3512 return -1;
3513 }
3514
3515 p = s;
3516 end = s + length;
3517 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003519 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520 PyObject *repunicode;
3521 int repsize;
3522 int newpos;
3523 Py_UNICODE *uni2;
3524 Py_UNICODE *collstart;
3525 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003526
Guido van Rossum9e896b32000-04-05 20:11:21 +00003527 if (Py_UNICODE_ISSPACE(ch)) {
3528 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003530 continue;
3531 }
3532 decimal = Py_UNICODE_TODECIMAL(ch);
3533 if (decimal >= 0) {
3534 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003536 continue;
3537 }
Guido van Rossumba477042000-04-06 18:18:10 +00003538 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003539 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003541 continue;
3542 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 /* All other characters are considered unencodable */
3544 collstart = p;
3545 collend = p+1;
3546 while (collend < end) {
3547 if ((0 < *collend && *collend < 256) ||
3548 !Py_UNICODE_ISSPACE(*collend) ||
3549 Py_UNICODE_TODECIMAL(*collend))
3550 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003551 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 /* cache callback name lookup
3553 * (if not done yet, i.e. it's the first error) */
3554 if (known_errorHandler==-1) {
3555 if ((errors==NULL) || (!strcmp(errors, "strict")))
3556 known_errorHandler = 1;
3557 else if (!strcmp(errors, "replace"))
3558 known_errorHandler = 2;
3559 else if (!strcmp(errors, "ignore"))
3560 known_errorHandler = 3;
3561 else if (!strcmp(errors, "xmlcharrefreplace"))
3562 known_errorHandler = 4;
3563 else
3564 known_errorHandler = 0;
3565 }
3566 switch (known_errorHandler) {
3567 case 1: /* strict */
3568 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3569 goto onError;
3570 case 2: /* replace */
3571 for (p = collstart; p < collend; ++p)
3572 *output++ = '?';
3573 /* fall through */
3574 case 3: /* ignore */
3575 p = collend;
3576 break;
3577 case 4: /* xmlcharrefreplace */
3578 /* generate replacement (temporarily (mis)uses p) */
3579 for (p = collstart; p < collend; ++p)
3580 output += sprintf(output, "&#%d;", (int)*p);
3581 p = collend;
3582 break;
3583 default:
3584 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3585 encoding, reason, s, length, &exc,
3586 collstart-s, collend-s, &newpos);
3587 if (repunicode == NULL)
3588 goto onError;
3589 /* generate replacement */
3590 repsize = PyUnicode_GET_SIZE(repunicode);
3591 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3592 Py_UNICODE ch = *uni2;
3593 if (Py_UNICODE_ISSPACE(ch))
3594 *output++ = ' ';
3595 else {
3596 decimal = Py_UNICODE_TODECIMAL(ch);
3597 if (decimal >= 0)
3598 *output++ = '0' + decimal;
3599 else if (0 < ch && ch < 256)
3600 *output++ = (char)ch;
3601 else {
3602 Py_DECREF(repunicode);
3603 raise_encode_exception(&exc, encoding,
3604 s, length, collstart-s, collend-s, reason);
3605 goto onError;
3606 }
3607 }
3608 }
3609 p = s + newpos;
3610 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003611 }
3612 }
3613 /* 0-terminate the output string */
3614 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 Py_XDECREF(exc);
3616 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003617 return 0;
3618
3619 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 Py_XDECREF(exc);
3621 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003622 return -1;
3623}
3624
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625/* --- Helpers ------------------------------------------------------------ */
3626
Tim Petersced69f82003-09-16 20:30:58 +00003627static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628int count(PyUnicodeObject *self,
3629 int start,
3630 int end,
3631 PyUnicodeObject *substring)
3632{
3633 int count = 0;
3634
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003635 if (start < 0)
3636 start += self->length;
3637 if (start < 0)
3638 start = 0;
3639 if (end > self->length)
3640 end = self->length;
3641 if (end < 0)
3642 end += self->length;
3643 if (end < 0)
3644 end = 0;
3645
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003646 if (substring->length == 0)
3647 return (end - start + 1);
3648
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 end -= substring->length;
3650
3651 while (start <= end)
3652 if (Py_UNICODE_MATCH(self, start, substring)) {
3653 count++;
3654 start += substring->length;
3655 } else
3656 start++;
3657
3658 return count;
3659}
3660
3661int PyUnicode_Count(PyObject *str,
3662 PyObject *substr,
3663 int start,
3664 int end)
3665{
3666 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003667
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 str = PyUnicode_FromObject(str);
3669 if (str == NULL)
3670 return -1;
3671 substr = PyUnicode_FromObject(substr);
3672 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003673 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 return -1;
3675 }
Tim Petersced69f82003-09-16 20:30:58 +00003676
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 result = count((PyUnicodeObject *)str,
3678 start, end,
3679 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003680
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 Py_DECREF(str);
3682 Py_DECREF(substr);
3683 return result;
3684}
3685
Tim Petersced69f82003-09-16 20:30:58 +00003686static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687int findstring(PyUnicodeObject *self,
3688 PyUnicodeObject *substring,
3689 int start,
3690 int end,
3691 int direction)
3692{
3693 if (start < 0)
3694 start += self->length;
3695 if (start < 0)
3696 start = 0;
3697
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 if (end > self->length)
3699 end = self->length;
3700 if (end < 0)
3701 end += self->length;
3702 if (end < 0)
3703 end = 0;
3704
Guido van Rossum76afbd92002-08-20 17:29:29 +00003705 if (substring->length == 0)
3706 return (direction > 0) ? start : end;
3707
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 end -= substring->length;
3709
3710 if (direction < 0) {
3711 for (; end >= start; end--)
3712 if (Py_UNICODE_MATCH(self, end, substring))
3713 return end;
3714 } else {
3715 for (; start <= end; start++)
3716 if (Py_UNICODE_MATCH(self, start, substring))
3717 return start;
3718 }
3719
3720 return -1;
3721}
3722
3723int PyUnicode_Find(PyObject *str,
3724 PyObject *substr,
3725 int start,
3726 int end,
3727 int direction)
3728{
3729 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003730
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 str = PyUnicode_FromObject(str);
3732 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003733 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734 substr = PyUnicode_FromObject(substr);
3735 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003736 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003737 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 }
Tim Petersced69f82003-09-16 20:30:58 +00003739
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 result = findstring((PyUnicodeObject *)str,
3741 (PyUnicodeObject *)substr,
3742 start, end, direction);
3743 Py_DECREF(str);
3744 Py_DECREF(substr);
3745 return result;
3746}
3747
Tim Petersced69f82003-09-16 20:30:58 +00003748static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749int tailmatch(PyUnicodeObject *self,
3750 PyUnicodeObject *substring,
3751 int start,
3752 int end,
3753 int direction)
3754{
3755 if (start < 0)
3756 start += self->length;
3757 if (start < 0)
3758 start = 0;
3759
3760 if (substring->length == 0)
3761 return 1;
3762
3763 if (end > self->length)
3764 end = self->length;
3765 if (end < 0)
3766 end += self->length;
3767 if (end < 0)
3768 end = 0;
3769
3770 end -= substring->length;
3771 if (end < start)
3772 return 0;
3773
3774 if (direction > 0) {
3775 if (Py_UNICODE_MATCH(self, end, substring))
3776 return 1;
3777 } else {
3778 if (Py_UNICODE_MATCH(self, start, substring))
3779 return 1;
3780 }
3781
3782 return 0;
3783}
3784
3785int PyUnicode_Tailmatch(PyObject *str,
3786 PyObject *substr,
3787 int start,
3788 int end,
3789 int direction)
3790{
3791 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003792
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 str = PyUnicode_FromObject(str);
3794 if (str == NULL)
3795 return -1;
3796 substr = PyUnicode_FromObject(substr);
3797 if (substr == NULL) {
3798 Py_DECREF(substr);
3799 return -1;
3800 }
Tim Petersced69f82003-09-16 20:30:58 +00003801
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 result = tailmatch((PyUnicodeObject *)str,
3803 (PyUnicodeObject *)substr,
3804 start, end, direction);
3805 Py_DECREF(str);
3806 Py_DECREF(substr);
3807 return result;
3808}
3809
Tim Petersced69f82003-09-16 20:30:58 +00003810static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811const Py_UNICODE *findchar(const Py_UNICODE *s,
3812 int size,
3813 Py_UNICODE ch)
3814{
3815 /* like wcschr, but doesn't stop at NULL characters */
3816
3817 while (size-- > 0) {
3818 if (*s == ch)
3819 return s;
3820 s++;
3821 }
3822
3823 return NULL;
3824}
3825
3826/* Apply fixfct filter to the Unicode object self and return a
3827 reference to the modified object */
3828
Tim Petersced69f82003-09-16 20:30:58 +00003829static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830PyObject *fixup(PyUnicodeObject *self,
3831 int (*fixfct)(PyUnicodeObject *s))
3832{
3833
3834 PyUnicodeObject *u;
3835
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003836 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 if (u == NULL)
3838 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003839
3840 Py_UNICODE_COPY(u->str, self->str, self->length);
3841
Tim Peters7a29bd52001-09-12 03:03:31 +00003842 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 /* fixfct should return TRUE if it modified the buffer. If
3844 FALSE, return a reference to the original buffer instead
3845 (to save space, not time) */
3846 Py_INCREF(self);
3847 Py_DECREF(u);
3848 return (PyObject*) self;
3849 }
3850 return (PyObject*) u;
3851}
3852
Tim Petersced69f82003-09-16 20:30:58 +00003853static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854int fixupper(PyUnicodeObject *self)
3855{
3856 int len = self->length;
3857 Py_UNICODE *s = self->str;
3858 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003859
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860 while (len-- > 0) {
3861 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003862
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 ch = Py_UNICODE_TOUPPER(*s);
3864 if (ch != *s) {
3865 status = 1;
3866 *s = ch;
3867 }
3868 s++;
3869 }
3870
3871 return status;
3872}
3873
Tim Petersced69f82003-09-16 20:30:58 +00003874static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875int fixlower(PyUnicodeObject *self)
3876{
3877 int len = self->length;
3878 Py_UNICODE *s = self->str;
3879 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003880
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881 while (len-- > 0) {
3882 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003883
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 ch = Py_UNICODE_TOLOWER(*s);
3885 if (ch != *s) {
3886 status = 1;
3887 *s = ch;
3888 }
3889 s++;
3890 }
3891
3892 return status;
3893}
3894
Tim Petersced69f82003-09-16 20:30:58 +00003895static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896int fixswapcase(PyUnicodeObject *self)
3897{
3898 int len = self->length;
3899 Py_UNICODE *s = self->str;
3900 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003901
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 while (len-- > 0) {
3903 if (Py_UNICODE_ISUPPER(*s)) {
3904 *s = Py_UNICODE_TOLOWER(*s);
3905 status = 1;
3906 } else if (Py_UNICODE_ISLOWER(*s)) {
3907 *s = Py_UNICODE_TOUPPER(*s);
3908 status = 1;
3909 }
3910 s++;
3911 }
3912
3913 return status;
3914}
3915
Tim Petersced69f82003-09-16 20:30:58 +00003916static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917int fixcapitalize(PyUnicodeObject *self)
3918{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003919 int len = self->length;
3920 Py_UNICODE *s = self->str;
3921 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003922
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003923 if (len == 0)
3924 return 0;
3925 if (Py_UNICODE_ISLOWER(*s)) {
3926 *s = Py_UNICODE_TOUPPER(*s);
3927 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003929 s++;
3930 while (--len > 0) {
3931 if (Py_UNICODE_ISUPPER(*s)) {
3932 *s = Py_UNICODE_TOLOWER(*s);
3933 status = 1;
3934 }
3935 s++;
3936 }
3937 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938}
3939
3940static
3941int fixtitle(PyUnicodeObject *self)
3942{
3943 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3944 register Py_UNICODE *e;
3945 int previous_is_cased;
3946
3947 /* Shortcut for single character strings */
3948 if (PyUnicode_GET_SIZE(self) == 1) {
3949 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3950 if (*p != ch) {
3951 *p = ch;
3952 return 1;
3953 }
3954 else
3955 return 0;
3956 }
Tim Petersced69f82003-09-16 20:30:58 +00003957
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 e = p + PyUnicode_GET_SIZE(self);
3959 previous_is_cased = 0;
3960 for (; p < e; p++) {
3961 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003962
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 if (previous_is_cased)
3964 *p = Py_UNICODE_TOLOWER(ch);
3965 else
3966 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003967
3968 if (Py_UNICODE_ISLOWER(ch) ||
3969 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 Py_UNICODE_ISTITLE(ch))
3971 previous_is_cased = 1;
3972 else
3973 previous_is_cased = 0;
3974 }
3975 return 1;
3976}
3977
Tim Peters8ce9f162004-08-27 01:49:32 +00003978PyObject *
3979PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980{
Tim Peters8ce9f162004-08-27 01:49:32 +00003981 PyObject *internal_separator = NULL;
Tim Peters05eba1f2004-08-27 21:32:02 +00003982 const Py_UNICODE *sep;
Tim Peters8ce9f162004-08-27 01:49:32 +00003983 size_t seplen;
Tim Peters05eba1f2004-08-27 21:32:02 +00003984 PyUnicodeObject *res = NULL; /* the result */
3985 size_t res_alloc = 100; /* # allocated bytes for string in res */
3986 size_t res_used; /* # used bytes */
3987 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
3988 PyObject *fseq; /* PySequence_Fast(seq) */
3989 int seqlen; /* len(fseq) -- number of items in sequence */
3990 const Py_UNICODE blank = ' ';
Tim Peters8ce9f162004-08-27 01:49:32 +00003991 PyObject *item;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 int i;
3993
Tim Peters05eba1f2004-08-27 21:32:02 +00003994 fseq = PySequence_Fast(seq, "");
3995 if (fseq == NULL) {
3996 if (PyErr_ExceptionMatches(PyExc_TypeError))
3997 PyErr_Format(PyExc_TypeError,
3998 "sequence expected, %.80s found",
3999 seq->ob_type->tp_name);
4000 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004001 }
4002
Tim Peters91879ab2004-08-27 22:35:44 +00004003 /* Grrrr. A codec may be invoked to convert str objects to
4004 * Unicode, and so it's possible to call back into Python code
4005 * during PyUnicode_FromObject(), and so it's possible for a sick
4006 * codec to change the size of fseq (if seq is a list). Therefore
4007 * we have to keep refetching the size -- can't assume seqlen
4008 * is invariant.
4009 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004010 seqlen = PySequence_Fast_GET_SIZE(fseq);
4011 /* If empty sequence, return u"". */
4012 if (seqlen == 0) {
4013 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4014 goto Done;
4015 }
4016 /* If singleton sequence with an exact Unicode, return that. */
4017 if (seqlen == 1) {
4018 item = PySequence_Fast_GET_ITEM(fseq, 0);
4019 if (PyUnicode_CheckExact(item)) {
4020 Py_INCREF(item);
4021 res = (PyUnicodeObject *)item;
4022 goto Done;
4023 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004024 }
4025
Tim Peters05eba1f2004-08-27 21:32:02 +00004026 /* At least two items to join, or one that isn't exact Unicode. */
4027 if (seqlen > 1) {
4028 /* Set up sep and seplen -- they're needed. */
4029 if (separator == NULL) {
4030 sep = &blank;
4031 seplen = 1;
4032 }
4033 else {
4034 internal_separator = PyUnicode_FromObject(separator);
4035 if (internal_separator == NULL)
4036 goto onError;
4037 sep = PyUnicode_AS_UNICODE(internal_separator);
4038 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004039 /* In case PyUnicode_FromObject() mutated seq. */
4040 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004041 }
4042 }
4043
4044 /* Get space. */
4045 res = _PyUnicode_New((int)res_alloc);
4046 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004047 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004048 res_p = PyUnicode_AS_UNICODE(res);
4049 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004050
Tim Peters05eba1f2004-08-27 21:32:02 +00004051 for (i = 0; i < seqlen; ++i) {
4052 size_t itemlen;
4053 size_t new_res_used;
4054
4055 item = PySequence_Fast_GET_ITEM(fseq, i);
4056 /* Convert item to Unicode. */
4057 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4058 PyErr_Format(PyExc_TypeError,
4059 "sequence item %i: expected string or Unicode,"
4060 " %.80s found",
4061 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004062 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004063 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004064 item = PyUnicode_FromObject(item);
4065 if (item == NULL)
4066 goto onError;
4067 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004068
Tim Peters91879ab2004-08-27 22:35:44 +00004069 /* In case PyUnicode_FromObject() mutated seq. */
4070 seqlen = PySequence_Fast_GET_SIZE(fseq);
4071
Tim Peters8ce9f162004-08-27 01:49:32 +00004072 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004074 new_res_used = res_used + itemlen;
4075 if (new_res_used < res_used || new_res_used > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004076 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004077 if (i < seqlen - 1) {
4078 new_res_used += seplen;
4079 if (new_res_used < res_used || new_res_used > INT_MAX)
4080 goto Overflow;
4081 }
4082 if (new_res_used > res_alloc) {
4083 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004084 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004085 size_t oldsize = res_alloc;
4086 res_alloc += res_alloc;
4087 if (res_alloc < oldsize || res_alloc > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004088 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004089 } while (new_res_used > res_alloc);
4090 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004091 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004093 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004094 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004096
4097 /* Copy item, and maybe the separator. */
4098 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4099 res_p += itemlen;
4100 if (i < seqlen - 1) {
4101 Py_UNICODE_COPY(res_p, sep, (int)seplen);
4102 res_p += seplen;
4103 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004105 res_used = new_res_used;
4106 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004107
Tim Peters05eba1f2004-08-27 21:32:02 +00004108 /* Shrink res to match the used area; this probably can't fail,
4109 * but it's cheap to check.
4110 */
4111 if (_PyUnicode_Resize(&res, (int)res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004112 goto onError;
4113
4114 Done:
4115 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004116 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 return (PyObject *)res;
4118
Tim Peters8ce9f162004-08-27 01:49:32 +00004119 Overflow:
4120 PyErr_SetString(PyExc_OverflowError,
4121 "join() is too long for a Python string");
4122 Py_DECREF(item);
4123 /* fall through */
4124
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004126 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004127 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004128 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 return NULL;
4130}
4131
Tim Petersced69f82003-09-16 20:30:58 +00004132static
4133PyUnicodeObject *pad(PyUnicodeObject *self,
4134 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 int right,
4136 Py_UNICODE fill)
4137{
4138 PyUnicodeObject *u;
4139
4140 if (left < 0)
4141 left = 0;
4142 if (right < 0)
4143 right = 0;
4144
Tim Peters7a29bd52001-09-12 03:03:31 +00004145 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 Py_INCREF(self);
4147 return self;
4148 }
4149
4150 u = _PyUnicode_New(left + self->length + right);
4151 if (u) {
4152 if (left)
4153 Py_UNICODE_FILL(u->str, fill, left);
4154 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4155 if (right)
4156 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4157 }
4158
4159 return u;
4160}
4161
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expanding function must provide a `PyObject *str` scratch
   variable, a `list` object and an `onError` label; the macro jumps to
   `onError` if either the slice creation or the append fails.  The
   temporary reference held in `str` is always released.

   Wrapped in do { } while (0) so the macro behaves as one statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_failed_;                                       \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_failed_ = PyList_Append(list, str);                \
        Py_DECREF(str);                                                 \
        if (split_append_failed_)                                       \
            goto onError;                                               \
    } while (0)
4172
/* Insert the slice data[left:right] at the front of `list` as a new
   Unicode object.

   The expanding function must provide a `PyObject *str` scratch
   variable, a `list` object and an `onError` label; the macro jumps to
   `onError` if either the slice creation or the insert fails.  The
   temporary reference held in `str` is always released.

   Wrapped in do { } while (0) so the macro behaves as one statement. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        int split_insert_failed_;                                       \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_insert_failed_ = PyList_Insert(list, 0, str);             \
        Py_DECREF(str);                                                 \
        if (split_insert_failed_)                                       \
            goto onError;                                               \
    } while (0)
4183
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184static
4185PyObject *split_whitespace(PyUnicodeObject *self,
4186 PyObject *list,
4187 int maxcount)
4188{
4189 register int i;
4190 register int j;
4191 int len = self->length;
4192 PyObject *str;
4193
4194 for (i = j = 0; i < len; ) {
4195 /* find a token */
4196 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4197 i++;
4198 j = i;
4199 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4200 i++;
4201 if (j < i) {
4202 if (maxcount-- <= 0)
4203 break;
4204 SPLIT_APPEND(self->str, j, i);
4205 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4206 i++;
4207 j = i;
4208 }
4209 }
4210 if (j < len) {
4211 SPLIT_APPEND(self->str, j, len);
4212 }
4213 return list;
4214
4215 onError:
4216 Py_DECREF(list);
4217 return NULL;
4218}
4219
4220PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004221 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222{
4223 register int i;
4224 register int j;
4225 int len;
4226 PyObject *list;
4227 PyObject *str;
4228 Py_UNICODE *data;
4229
4230 string = PyUnicode_FromObject(string);
4231 if (string == NULL)
4232 return NULL;
4233 data = PyUnicode_AS_UNICODE(string);
4234 len = PyUnicode_GET_SIZE(string);
4235
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236 list = PyList_New(0);
4237 if (!list)
4238 goto onError;
4239
4240 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004241 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004242
Guido van Rossumd57fd912000-03-10 22:53:23 +00004243 /* Find a line and append it */
4244 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4245 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246
4247 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004248 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249 if (i < len) {
4250 if (data[i] == '\r' && i + 1 < len &&
4251 data[i+1] == '\n')
4252 i += 2;
4253 else
4254 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004255 if (keepends)
4256 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 }
Guido van Rossum86662912000-04-11 15:38:46 +00004258 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 j = i;
4260 }
4261 if (j < len) {
4262 SPLIT_APPEND(data, j, len);
4263 }
4264
4265 Py_DECREF(string);
4266 return list;
4267
4268 onError:
4269 Py_DECREF(list);
4270 Py_DECREF(string);
4271 return NULL;
4272}
4273
Tim Petersced69f82003-09-16 20:30:58 +00004274static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275PyObject *split_char(PyUnicodeObject *self,
4276 PyObject *list,
4277 Py_UNICODE ch,
4278 int maxcount)
4279{
4280 register int i;
4281 register int j;
4282 int len = self->length;
4283 PyObject *str;
4284
4285 for (i = j = 0; i < len; ) {
4286 if (self->str[i] == ch) {
4287 if (maxcount-- <= 0)
4288 break;
4289 SPLIT_APPEND(self->str, j, i);
4290 i = j = i + 1;
4291 } else
4292 i++;
4293 }
4294 if (j <= len) {
4295 SPLIT_APPEND(self->str, j, len);
4296 }
4297 return list;
4298
4299 onError:
4300 Py_DECREF(list);
4301 return NULL;
4302}
4303
Tim Petersced69f82003-09-16 20:30:58 +00004304static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305PyObject *split_substring(PyUnicodeObject *self,
4306 PyObject *list,
4307 PyUnicodeObject *substring,
4308 int maxcount)
4309{
4310 register int i;
4311 register int j;
4312 int len = self->length;
4313 int sublen = substring->length;
4314 PyObject *str;
4315
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004316 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317 if (Py_UNICODE_MATCH(self, i, substring)) {
4318 if (maxcount-- <= 0)
4319 break;
4320 SPLIT_APPEND(self->str, j, i);
4321 i = j = i + sublen;
4322 } else
4323 i++;
4324 }
4325 if (j <= len) {
4326 SPLIT_APPEND(self->str, j, len);
4327 }
4328 return list;
4329
4330 onError:
4331 Py_DECREF(list);
4332 return NULL;
4333}
4334
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004335static
4336PyObject *rsplit_whitespace(PyUnicodeObject *self,
4337 PyObject *list,
4338 int maxcount)
4339{
4340 register int i;
4341 register int j;
4342 int len = self->length;
4343 PyObject *str;
4344
4345 for (i = j = len - 1; i >= 0; ) {
4346 /* find a token */
4347 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4348 i--;
4349 j = i;
4350 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4351 i--;
4352 if (j > i) {
4353 if (maxcount-- <= 0)
4354 break;
4355 SPLIT_INSERT(self->str, i + 1, j + 1);
4356 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4357 i--;
4358 j = i;
4359 }
4360 }
4361 if (j >= 0) {
4362 SPLIT_INSERT(self->str, 0, j + 1);
4363 }
4364 return list;
4365
4366 onError:
4367 Py_DECREF(list);
4368 return NULL;
4369}
4370
4371static
4372PyObject *rsplit_char(PyUnicodeObject *self,
4373 PyObject *list,
4374 Py_UNICODE ch,
4375 int maxcount)
4376{
4377 register int i;
4378 register int j;
4379 int len = self->length;
4380 PyObject *str;
4381
4382 for (i = j = len - 1; i >= 0; ) {
4383 if (self->str[i] == ch) {
4384 if (maxcount-- <= 0)
4385 break;
4386 SPLIT_INSERT(self->str, i + 1, j + 1);
4387 j = i = i - 1;
4388 } else
4389 i--;
4390 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004391 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004392 SPLIT_INSERT(self->str, 0, j + 1);
4393 }
4394 return list;
4395
4396 onError:
4397 Py_DECREF(list);
4398 return NULL;
4399}
4400
4401static
4402PyObject *rsplit_substring(PyUnicodeObject *self,
4403 PyObject *list,
4404 PyUnicodeObject *substring,
4405 int maxcount)
4406{
4407 register int i;
4408 register int j;
4409 int len = self->length;
4410 int sublen = substring->length;
4411 PyObject *str;
4412
4413 for (i = len - sublen, j = len; i >= 0; ) {
4414 if (Py_UNICODE_MATCH(self, i, substring)) {
4415 if (maxcount-- <= 0)
4416 break;
4417 SPLIT_INSERT(self->str, i + sublen, j);
4418 j = i;
4419 i -= sublen;
4420 } else
4421 i--;
4422 }
4423 if (j >= 0) {
4424 SPLIT_INSERT(self->str, 0, j);
4425 }
4426 return list;
4427
4428 onError:
4429 Py_DECREF(list);
4430 return NULL;
4431}
4432
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004434#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435
4436static
4437PyObject *split(PyUnicodeObject *self,
4438 PyUnicodeObject *substring,
4439 int maxcount)
4440{
4441 PyObject *list;
4442
4443 if (maxcount < 0)
4444 maxcount = INT_MAX;
4445
4446 list = PyList_New(0);
4447 if (!list)
4448 return NULL;
4449
4450 if (substring == NULL)
4451 return split_whitespace(self,list,maxcount);
4452
4453 else if (substring->length == 1)
4454 return split_char(self,list,substring->str[0],maxcount);
4455
4456 else if (substring->length == 0) {
4457 Py_DECREF(list);
4458 PyErr_SetString(PyExc_ValueError, "empty separator");
4459 return NULL;
4460 }
4461 else
4462 return split_substring(self,list,substring,maxcount);
4463}
4464
Tim Petersced69f82003-09-16 20:30:58 +00004465static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004466PyObject *rsplit(PyUnicodeObject *self,
4467 PyUnicodeObject *substring,
4468 int maxcount)
4469{
4470 PyObject *list;
4471
4472 if (maxcount < 0)
4473 maxcount = INT_MAX;
4474
4475 list = PyList_New(0);
4476 if (!list)
4477 return NULL;
4478
4479 if (substring == NULL)
4480 return rsplit_whitespace(self,list,maxcount);
4481
4482 else if (substring->length == 1)
4483 return rsplit_char(self,list,substring->str[0],maxcount);
4484
4485 else if (substring->length == 0) {
4486 Py_DECREF(list);
4487 PyErr_SetString(PyExc_ValueError, "empty separator");
4488 return NULL;
4489 }
4490 else
4491 return rsplit_substring(self,list,substring,maxcount);
4492}
4493
4494static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495PyObject *replace(PyUnicodeObject *self,
4496 PyUnicodeObject *str1,
4497 PyUnicodeObject *str2,
4498 int maxcount)
4499{
4500 PyUnicodeObject *u;
4501
4502 if (maxcount < 0)
4503 maxcount = INT_MAX;
4504
4505 if (str1->length == 1 && str2->length == 1) {
4506 int i;
4507
4508 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004509 if (!findchar(self->str, self->length, str1->str[0]) &&
4510 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511 /* nothing to replace, return original string */
4512 Py_INCREF(self);
4513 u = self;
4514 } else {
4515 Py_UNICODE u1 = str1->str[0];
4516 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004517
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004519 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520 self->length
4521 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004522 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004523 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004524 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 for (i = 0; i < u->length; i++)
4526 if (u->str[i] == u1) {
4527 if (--maxcount < 0)
4528 break;
4529 u->str[i] = u2;
4530 }
4531 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533
4534 } else {
4535 int n, i;
4536 Py_UNICODE *p;
4537
4538 /* replace strings */
4539 n = count(self, 0, self->length, str1);
4540 if (n > maxcount)
4541 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004542 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004544 if (PyUnicode_CheckExact(self)) {
4545 Py_INCREF(self);
4546 u = self;
4547 }
4548 else {
4549 u = (PyUnicodeObject *)
4550 PyUnicode_FromUnicode(self->str, self->length);
4551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552 } else {
4553 u = _PyUnicode_New(
4554 self->length + n * (str2->length - str1->length));
4555 if (u) {
4556 i = 0;
4557 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004558 if (str1->length > 0) {
4559 while (i <= self->length - str1->length)
4560 if (Py_UNICODE_MATCH(self, i, str1)) {
4561 /* replace string segment */
4562 Py_UNICODE_COPY(p, str2->str, str2->length);
4563 p += str2->length;
4564 i += str1->length;
4565 if (--n <= 0) {
4566 /* copy remaining part */
4567 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4568 break;
4569 }
4570 } else
4571 *p++ = self->str[i++];
4572 } else {
4573 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 Py_UNICODE_COPY(p, str2->str, str2->length);
4575 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004576 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004579 }
4580 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582 }
4583 }
4584 }
Tim Petersced69f82003-09-16 20:30:58 +00004585
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586 return (PyObject *) u;
4587}
4588
4589/* --- Unicode Object Methods --------------------------------------------- */
4590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004591PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592"S.title() -> unicode\n\
4593\n\
4594Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004595characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596
4597static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004598unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600 return fixup(self, fixtitle);
4601}
4602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004603PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604"S.capitalize() -> unicode\n\
4605\n\
4606Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004607have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004608
4609static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004610unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612 return fixup(self, fixcapitalize);
4613}
4614
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits `self` on whitespace, capitalizes each word in place in the
   temporary list, and joins the words back together.
   Returns the joined string, or NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
4652
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004653/* Argument converter. Coerces to a single unicode character */
4654
4655static int
4656convert_uc(PyObject *obj, void *addr)
4657{
4658 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4659 PyObject *uniobj;
4660 Py_UNICODE *unistr;
4661
4662 uniobj = PyUnicode_FromObject(obj);
4663 if (uniobj == NULL) {
4664 PyErr_SetString(PyExc_TypeError,
4665 "The fill character cannot be converted to Unicode");
4666 return 0;
4667 }
4668 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4669 PyErr_SetString(PyExc_TypeError,
4670 "The fill character must be exactly one character long");
4671 Py_DECREF(uniobj);
4672 return 0;
4673 }
4674 unistr = PyUnicode_AS_UNICODE(uniobj);
4675 *fillcharloc = unistr[0];
4676 Py_DECREF(uniobj);
4677 return 1;
4678}
4679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004680PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004681"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004683Return S centered in a Unicode string of length width. Padding is\n\
4684done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685
4686static PyObject *
4687unicode_center(PyUnicodeObject *self, PyObject *args)
4688{
4689 int marg, left;
4690 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004691 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004693 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 return NULL;
4695
Tim Peters7a29bd52001-09-12 03:03:31 +00004696 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697 Py_INCREF(self);
4698 return (PyObject*) self;
4699 }
4700
4701 marg = width - self->length;
4702 left = marg / 2 + (marg & width & 1);
4703
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004704 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705}
4706
Marc-André Lemburge5034372000-08-08 08:04:29 +00004707#if 0
4708
4709/* This code should go into some future Unicode collation support
4710 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004711 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004712
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004713/* speedy UTF-16 code point order comparison */
4714/* gleaned from: */
4715/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4716
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004717static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004718{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004719 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004720 0, 0, 0, 0, 0, 0, 0, 0,
4721 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004722 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004723};
4724
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725static int
4726unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4727{
4728 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004729
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 Py_UNICODE *s1 = str1->str;
4731 Py_UNICODE *s2 = str2->str;
4732
4733 len1 = str1->length;
4734 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004735
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004737 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004738
4739 c1 = *s1++;
4740 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004741
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004742 if (c1 > (1<<11) * 26)
4743 c1 += utf16Fixup[c1>>11];
4744 if (c2 > (1<<11) * 26)
4745 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004746 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004747
4748 if (c1 != c2)
4749 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004750
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004751 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752 }
4753
4754 return (len1 < len2) ? -1 : (len1 != len2);
4755}
4756
Marc-André Lemburge5034372000-08-08 08:04:29 +00004757#else
4758
4759static int
4760unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4761{
4762 register int len1, len2;
4763
4764 Py_UNICODE *s1 = str1->str;
4765 Py_UNICODE *s2 = str2->str;
4766
4767 len1 = str1->length;
4768 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004769
Marc-André Lemburge5034372000-08-08 08:04:29 +00004770 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004771 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004772
Fredrik Lundh45714e92001-06-26 16:39:36 +00004773 c1 = *s1++;
4774 c2 = *s2++;
4775
4776 if (c1 != c2)
4777 return (c1 < c2) ? -1 : 1;
4778
Marc-André Lemburge5034372000-08-08 08:04:29 +00004779 len1--; len2--;
4780 }
4781
4782 return (len1 < len2) ? -1 : (len1 != len2);
4783}
4784
4785#endif
4786
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787int PyUnicode_Compare(PyObject *left,
4788 PyObject *right)
4789{
4790 PyUnicodeObject *u = NULL, *v = NULL;
4791 int result;
4792
4793 /* Coerce the two arguments */
4794 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4795 if (u == NULL)
4796 goto onError;
4797 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4798 if (v == NULL)
4799 goto onError;
4800
Thomas Wouters7e474022000-07-16 12:04:32 +00004801 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802 if (v == u) {
4803 Py_DECREF(u);
4804 Py_DECREF(v);
4805 return 0;
4806 }
4807
4808 result = unicode_compare(u, v);
4809
4810 Py_DECREF(u);
4811 Py_DECREF(v);
4812 return result;
4813
4814onError:
4815 Py_XDECREF(u);
4816 Py_XDECREF(v);
4817 return -1;
4818}
4819
Guido van Rossum403d68b2000-03-13 15:55:09 +00004820int PyUnicode_Contains(PyObject *container,
4821 PyObject *element)
4822{
4823 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004824 int result, size;
4825 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004826
4827 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004828 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004829 if (v == NULL) {
4830 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004831 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004832 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004833 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004834 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004835 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004836 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004837
Barry Warsaw817918c2002-08-06 16:58:21 +00004838 size = PyUnicode_GET_SIZE(v);
4839 rhs = PyUnicode_AS_UNICODE(v);
4840 lhs = PyUnicode_AS_UNICODE(u);
4841
Guido van Rossum403d68b2000-03-13 15:55:09 +00004842 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004843 if (size == 1) {
4844 end = lhs + PyUnicode_GET_SIZE(u);
4845 while (lhs < end) {
4846 if (*lhs++ == *rhs) {
4847 result = 1;
4848 break;
4849 }
4850 }
4851 }
4852 else {
4853 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4854 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004855 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004856 result = 1;
4857 break;
4858 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004859 }
4860 }
4861
4862 Py_DECREF(u);
4863 Py_DECREF(v);
4864 return result;
4865
4866onError:
4867 Py_XDECREF(u);
4868 Py_XDECREF(v);
4869 return -1;
4870}
4871
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872/* Concat to string or Unicode object giving a new Unicode object. */
4873
4874PyObject *PyUnicode_Concat(PyObject *left,
4875 PyObject *right)
4876{
4877 PyUnicodeObject *u = NULL, *v = NULL, *w;
4878
4879 /* Coerce the two arguments */
4880 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4881 if (u == NULL)
4882 goto onError;
4883 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4884 if (v == NULL)
4885 goto onError;
4886
4887 /* Shortcuts */
4888 if (v == unicode_empty) {
4889 Py_DECREF(v);
4890 return (PyObject *)u;
4891 }
4892 if (u == unicode_empty) {
4893 Py_DECREF(u);
4894 return (PyObject *)v;
4895 }
4896
4897 /* Concat the two Unicode strings */
4898 w = _PyUnicode_New(u->length + v->length);
4899 if (w == NULL)
4900 goto onError;
4901 Py_UNICODE_COPY(w->str, u->str, u->length);
4902 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4903
4904 Py_DECREF(u);
4905 Py_DECREF(v);
4906 return (PyObject *)w;
4907
4908onError:
4909 Py_XDECREF(u);
4910 Py_XDECREF(v);
4911 return NULL;
4912}
4913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004914PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915"S.count(sub[, start[, end]]) -> int\n\
4916\n\
4917Return the number of occurrences of substring sub in Unicode string\n\
4918S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004919interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920
4921static PyObject *
4922unicode_count(PyUnicodeObject *self, PyObject *args)
4923{
4924 PyUnicodeObject *substring;
4925 int start = 0;
4926 int end = INT_MAX;
4927 PyObject *result;
4928
Guido van Rossumb8872e62000-05-09 14:14:27 +00004929 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4930 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 return NULL;
4932
4933 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4934 (PyObject *)substring);
4935 if (substring == NULL)
4936 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004937
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 if (start < 0)
4939 start += self->length;
4940 if (start < 0)
4941 start = 0;
4942 if (end > self->length)
4943 end = self->length;
4944 if (end < 0)
4945 end += self->length;
4946 if (end < 0)
4947 end = 0;
4948
4949 result = PyInt_FromLong((long) count(self, start, end, substring));
4950
4951 Py_DECREF(substring);
4952 return result;
4953}
4954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004955PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004956"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004958Encodes S using the codec registered for encoding. encoding defaults\n\
4959to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004960handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4962'xmlcharrefreplace' as well as any other name registered with\n\
4963codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964
4965static PyObject *
4966unicode_encode(PyUnicodeObject *self, PyObject *args)
4967{
4968 char *encoding = NULL;
4969 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004970 PyObject *v;
4971
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4973 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004974 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004975 if (v == NULL)
4976 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004977 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
4978 PyErr_Format(PyExc_TypeError,
4979 "encoder did not return a string/unicode object "
4980 "(type=%.400s)",
4981 v->ob_type->tp_name);
4982 Py_DECREF(v);
4983 return NULL;
4984 }
4985 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004986
4987 onError:
4988 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004989}
4990
4991PyDoc_STRVAR(decode__doc__,
4992"S.decode([encoding[,errors]]) -> string or unicode\n\
4993\n\
4994Decodes S using the codec registered for encoding. encoding defaults\n\
4995to the default encoding. errors may be given to set a different error\n\
4996handling scheme. Default is 'strict' meaning that encoding errors raise\n\
4997a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
4998as well as any other name registerd with codecs.register_error that is\n\
4999able to handle UnicodeDecodeErrors.");
5000
5001static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005002unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005003{
5004 char *encoding = NULL;
5005 char *errors = NULL;
5006 PyObject *v;
5007
5008 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5009 return NULL;
5010 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005011 if (v == NULL)
5012 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005013 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5014 PyErr_Format(PyExc_TypeError,
5015 "decoder did not return a string/unicode object "
5016 "(type=%.400s)",
5017 v->ob_type->tp_name);
5018 Py_DECREF(v);
5019 return NULL;
5020 }
5021 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005022
5023 onError:
5024 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025}
5026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005027PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028"S.expandtabs([tabsize]) -> unicode\n\
5029\n\
5030Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005031If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032
5033static PyObject*
5034unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5035{
5036 Py_UNICODE *e;
5037 Py_UNICODE *p;
5038 Py_UNICODE *q;
5039 int i, j;
5040 PyUnicodeObject *u;
5041 int tabsize = 8;
5042
5043 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5044 return NULL;
5045
Thomas Wouters7e474022000-07-16 12:04:32 +00005046 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 i = j = 0;
5048 e = self->str + self->length;
5049 for (p = self->str; p < e; p++)
5050 if (*p == '\t') {
5051 if (tabsize > 0)
5052 j += tabsize - (j % tabsize);
5053 }
5054 else {
5055 j++;
5056 if (*p == '\n' || *p == '\r') {
5057 i += j;
5058 j = 0;
5059 }
5060 }
5061
5062 /* Second pass: create output string and fill it */
5063 u = _PyUnicode_New(i + j);
5064 if (!u)
5065 return NULL;
5066
5067 j = 0;
5068 q = u->str;
5069
5070 for (p = self->str; p < e; p++)
5071 if (*p == '\t') {
5072 if (tabsize > 0) {
5073 i = tabsize - (j % tabsize);
5074 j += i;
5075 while (i--)
5076 *q++ = ' ';
5077 }
5078 }
5079 else {
5080 j++;
5081 *q++ = *p;
5082 if (*p == '\n' || *p == '\r')
5083 j = 0;
5084 }
5085
5086 return (PyObject*) u;
5087}
5088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005089PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090"S.find(sub [,start [,end]]) -> int\n\
5091\n\
5092Return the lowest index in S where substring sub is found,\n\
5093such that sub is contained within s[start,end]. Optional\n\
5094arguments start and end are interpreted as in slice notation.\n\
5095\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005096Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097
5098static PyObject *
5099unicode_find(PyUnicodeObject *self, PyObject *args)
5100{
5101 PyUnicodeObject *substring;
5102 int start = 0;
5103 int end = INT_MAX;
5104 PyObject *result;
5105
Guido van Rossumb8872e62000-05-09 14:14:27 +00005106 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5107 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108 return NULL;
5109 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5110 (PyObject *)substring);
5111 if (substring == NULL)
5112 return NULL;
5113
5114 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5115
5116 Py_DECREF(substring);
5117 return result;
5118}
5119
5120static PyObject *
5121unicode_getitem(PyUnicodeObject *self, int index)
5122{
5123 if (index < 0 || index >= self->length) {
5124 PyErr_SetString(PyExc_IndexError, "string index out of range");
5125 return NULL;
5126 }
5127
5128 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5129}
5130
5131static long
5132unicode_hash(PyUnicodeObject *self)
5133{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005134 /* Since Unicode objects compare equal to their ASCII string
5135 counterparts, they should use the individual character values
5136 as basis for their hash value. This is needed to assure that
5137 strings and Unicode objects behave in the same way as
5138 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139
Fredrik Lundhdde61642000-07-10 18:27:47 +00005140 register int len;
5141 register Py_UNICODE *p;
5142 register long x;
5143
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144 if (self->hash != -1)
5145 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005146 len = PyUnicode_GET_SIZE(self);
5147 p = PyUnicode_AS_UNICODE(self);
5148 x = *p << 7;
5149 while (--len >= 0)
5150 x = (1000003*x) ^ *p++;
5151 x ^= PyUnicode_GET_SIZE(self);
5152 if (x == -1)
5153 x = -2;
5154 self->hash = x;
5155 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156}
5157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005158PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159"S.index(sub [,start [,end]]) -> int\n\
5160\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005161Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162
5163static PyObject *
5164unicode_index(PyUnicodeObject *self, PyObject *args)
5165{
5166 int result;
5167 PyUnicodeObject *substring;
5168 int start = 0;
5169 int end = INT_MAX;
5170
Guido van Rossumb8872e62000-05-09 14:14:27 +00005171 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5172 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005174
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5176 (PyObject *)substring);
5177 if (substring == NULL)
5178 return NULL;
5179
5180 result = findstring(self, substring, start, end, 1);
5181
5182 Py_DECREF(substring);
5183 if (result < 0) {
5184 PyErr_SetString(PyExc_ValueError, "substring not found");
5185 return NULL;
5186 }
5187 return PyInt_FromLong(result);
5188}
5189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005190PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005191"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005193Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005194at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195
5196static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005197unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198{
5199 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5200 register const Py_UNICODE *e;
5201 int cased;
5202
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 /* Shortcut for single character strings */
5204 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005205 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005207 /* Special case for empty strings */
5208 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005209 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005210
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 e = p + PyUnicode_GET_SIZE(self);
5212 cased = 0;
5213 for (; p < e; p++) {
5214 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005215
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005217 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 else if (!cased && Py_UNICODE_ISLOWER(ch))
5219 cased = 1;
5220 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005221 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222}
5223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005224PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005225"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005227Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005228at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229
5230static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005231unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232{
5233 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5234 register const Py_UNICODE *e;
5235 int cased;
5236
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 /* Shortcut for single character strings */
5238 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005239 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005241 /* Special case for empty strings */
5242 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005243 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005244
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245 e = p + PyUnicode_GET_SIZE(self);
5246 cased = 0;
5247 for (; p < e; p++) {
5248 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005249
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005251 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 else if (!cased && Py_UNICODE_ISUPPER(ch))
5253 cased = 1;
5254 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005255 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256}
5257
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005258PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005259"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005261Return True if S is a titlecased string and there is at least one\n\
5262character in S, i.e. upper- and titlecase characters may only\n\
5263follow uncased characters and lowercase characters only cased ones.\n\
5264Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265
5266static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005267unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268{
5269 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5270 register const Py_UNICODE *e;
5271 int cased, previous_is_cased;
5272
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 /* Shortcut for single character strings */
5274 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005275 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5276 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005278 /* Special case for empty strings */
5279 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005280 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005281
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 e = p + PyUnicode_GET_SIZE(self);
5283 cased = 0;
5284 previous_is_cased = 0;
5285 for (; p < e; p++) {
5286 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005287
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5289 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005290 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 previous_is_cased = 1;
5292 cased = 1;
5293 }
5294 else if (Py_UNICODE_ISLOWER(ch)) {
5295 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005296 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 previous_is_cased = 1;
5298 cased = 1;
5299 }
5300 else
5301 previous_is_cased = 0;
5302 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005303 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304}
5305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005306PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005307"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005309Return True if all characters in S are whitespace\n\
5310and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311
5312static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005313unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314{
5315 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5316 register const Py_UNICODE *e;
5317
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 /* Shortcut for single character strings */
5319 if (PyUnicode_GET_SIZE(self) == 1 &&
5320 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005321 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005323 /* Special case for empty strings */
5324 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005325 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005326
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 e = p + PyUnicode_GET_SIZE(self);
5328 for (; p < e; p++) {
5329 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005330 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005332 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333}
5334
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005335PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005336"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005337\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005338Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005339and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005340
5341static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005342unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005343{
5344 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5345 register const Py_UNICODE *e;
5346
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005347 /* Shortcut for single character strings */
5348 if (PyUnicode_GET_SIZE(self) == 1 &&
5349 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005350 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005351
5352 /* Special case for empty strings */
5353 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005354 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005355
5356 e = p + PyUnicode_GET_SIZE(self);
5357 for (; p < e; p++) {
5358 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005359 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005360 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005361 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005362}
5363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005364PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005365"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005366\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005367Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005368and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005369
5370static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005371unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005372{
5373 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5374 register const Py_UNICODE *e;
5375
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005376 /* Shortcut for single character strings */
5377 if (PyUnicode_GET_SIZE(self) == 1 &&
5378 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005379 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005380
5381 /* Special case for empty strings */
5382 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005383 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005384
5385 e = p + PyUnicode_GET_SIZE(self);
5386 for (; p < e; p++) {
5387 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005388 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005389 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005390 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005391}
5392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005393PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005394"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005396Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005397False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398
5399static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005400unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401{
5402 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5403 register const Py_UNICODE *e;
5404
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 /* Shortcut for single character strings */
5406 if (PyUnicode_GET_SIZE(self) == 1 &&
5407 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005408 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005410 /* Special case for empty strings */
5411 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005412 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005413
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 e = p + PyUnicode_GET_SIZE(self);
5415 for (; p < e; p++) {
5416 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005417 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005419 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420}
5421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005422PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005423"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005425Return True if all characters in S are digits\n\
5426and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427
5428static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005429unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430{
5431 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5432 register const Py_UNICODE *e;
5433
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 /* Shortcut for single character strings */
5435 if (PyUnicode_GET_SIZE(self) == 1 &&
5436 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005437 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005439 /* Special case for empty strings */
5440 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005441 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005442
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 e = p + PyUnicode_GET_SIZE(self);
5444 for (; p < e; p++) {
5445 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005446 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005448 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449}
5450
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005451PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005452"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005454Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005455False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456
5457static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005458unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459{
5460 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5461 register const Py_UNICODE *e;
5462
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 /* Shortcut for single character strings */
5464 if (PyUnicode_GET_SIZE(self) == 1 &&
5465 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005466 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005468 /* Special case for empty strings */
5469 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005470 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005471
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 e = p + PyUnicode_GET_SIZE(self);
5473 for (; p < e; p++) {
5474 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005475 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005477 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478}
5479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005480PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481"S.join(sequence) -> unicode\n\
5482\n\
5483Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005484sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485
5486static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005487unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005489 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490}
5491
5492static int
5493unicode_length(PyUnicodeObject *self)
5494{
5495 return self->length;
5496}
5497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005498PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005499"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500\n\
5501Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005502done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503
5504static PyObject *
5505unicode_ljust(PyUnicodeObject *self, PyObject *args)
5506{
5507 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005508 Py_UNICODE fillchar = ' ';
5509
5510 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 return NULL;
5512
Tim Peters7a29bd52001-09-12 03:03:31 +00005513 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 Py_INCREF(self);
5515 return (PyObject*) self;
5516 }
5517
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005518 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519}
5520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005521PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522"S.lower() -> unicode\n\
5523\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005524Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525
5526static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005527unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 return fixup(self, fixlower);
5530}
5531
/* Selectors telling the strip helpers which end(s) of the string to
   trim.  They double as indexes into stripformat[] below, so the values
   must stay 0, 1, 2 in this order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings for lstrip/rstrip/strip, indexed by the
   selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped */
#define STRIPNAME(i) (&stripformat[i][3])
5540
5541static const Py_UNICODE *
5542unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5543{
Tim Peters030a5ce2002-04-22 19:00:10 +00005544 size_t i;
5545 for (i = 0; i < n; ++i)
5546 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005547 return s+i;
5548 return NULL;
5549}
5550
5551/* externally visible for str.strip(unicode) */
5552PyObject *
5553_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5554{
5555 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5556 int len = PyUnicode_GET_SIZE(self);
5557 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5558 int seplen = PyUnicode_GET_SIZE(sepobj);
5559 int i, j;
5560
5561 i = 0;
5562 if (striptype != RIGHTSTRIP) {
5563 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5564 i++;
5565 }
5566 }
5567
5568 j = len;
5569 if (striptype != LEFTSTRIP) {
5570 do {
5571 j--;
5572 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5573 j++;
5574 }
5575
5576 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5577 Py_INCREF(self);
5578 return (PyObject*)self;
5579 }
5580 else
5581 return PyUnicode_FromUnicode(s+i, j-i);
5582}
5583
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584
5585static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005586do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005588 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5589 int len = PyUnicode_GET_SIZE(self), i, j;
5590
5591 i = 0;
5592 if (striptype != RIGHTSTRIP) {
5593 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5594 i++;
5595 }
5596 }
5597
5598 j = len;
5599 if (striptype != LEFTSTRIP) {
5600 do {
5601 j--;
5602 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5603 j++;
5604 }
5605
5606 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5607 Py_INCREF(self);
5608 return (PyObject*)self;
5609 }
5610 else
5611 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612}
5613
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005614
5615static PyObject *
5616do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5617{
5618 PyObject *sep = NULL;
5619
5620 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5621 return NULL;
5622
5623 if (sep != NULL && sep != Py_None) {
5624 if (PyUnicode_Check(sep))
5625 return _PyUnicode_XStrip(self, striptype, sep);
5626 else if (PyString_Check(sep)) {
5627 PyObject *res;
5628 sep = PyUnicode_FromObject(sep);
5629 if (sep==NULL)
5630 return NULL;
5631 res = _PyUnicode_XStrip(self, striptype, sep);
5632 Py_DECREF(sep);
5633 return res;
5634 }
5635 else {
5636 PyErr_Format(PyExc_TypeError,
5637 "%s arg must be None, unicode or str",
5638 STRIPNAME(striptype));
5639 return NULL;
5640 }
5641 }
5642
5643 return do_strip(self, striptype);
5644}
5645
5646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005647PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005648"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005649\n\
5650Return a copy of the string S with leading and trailing\n\
5651whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005652If chars is given and not None, remove characters in chars instead.\n\
5653If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005654
5655static PyObject *
5656unicode_strip(PyUnicodeObject *self, PyObject *args)
5657{
5658 if (PyTuple_GET_SIZE(args) == 0)
5659 return do_strip(self, BOTHSTRIP); /* Common case */
5660 else
5661 return do_argstrip(self, BOTHSTRIP, args);
5662}
5663
5664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005665PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005666"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005667\n\
5668Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005669If chars is given and not None, remove characters in chars instead.\n\
5670If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005671
5672static PyObject *
5673unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5674{
5675 if (PyTuple_GET_SIZE(args) == 0)
5676 return do_strip(self, LEFTSTRIP); /* Common case */
5677 else
5678 return do_argstrip(self, LEFTSTRIP, args);
5679}
5680
5681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005682PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005683"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005684\n\
5685Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005686If chars is given and not None, remove characters in chars instead.\n\
5687If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005688
5689static PyObject *
5690unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5691{
5692 if (PyTuple_GET_SIZE(args) == 0)
5693 return do_strip(self, RIGHTSTRIP); /* Common case */
5694 else
5695 return do_argstrip(self, RIGHTSTRIP, args);
5696}
5697
5698
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699static PyObject*
5700unicode_repeat(PyUnicodeObject *str, int len)
5701{
5702 PyUnicodeObject *u;
5703 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005704 int nchars;
5705 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706
5707 if (len < 0)
5708 len = 0;
5709
Tim Peters7a29bd52001-09-12 03:03:31 +00005710 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 /* no repeat, return original string */
5712 Py_INCREF(str);
5713 return (PyObject*) str;
5714 }
Tim Peters8f422462000-09-09 06:13:41 +00005715
5716 /* ensure # of chars needed doesn't overflow int and # of bytes
5717 * needed doesn't overflow size_t
5718 */
5719 nchars = len * str->length;
5720 if (len && nchars / len != str->length) {
5721 PyErr_SetString(PyExc_OverflowError,
5722 "repeated string is too long");
5723 return NULL;
5724 }
5725 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5726 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5727 PyErr_SetString(PyExc_OverflowError,
5728 "repeated string is too long");
5729 return NULL;
5730 }
5731 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 if (!u)
5733 return NULL;
5734
5735 p = u->str;
5736
5737 while (len-- > 0) {
5738 Py_UNICODE_COPY(p, str->str, str->length);
5739 p += str->length;
5740 }
5741
5742 return (PyObject*) u;
5743}
5744
5745PyObject *PyUnicode_Replace(PyObject *obj,
5746 PyObject *subobj,
5747 PyObject *replobj,
5748 int maxcount)
5749{
5750 PyObject *self;
5751 PyObject *str1;
5752 PyObject *str2;
5753 PyObject *result;
5754
5755 self = PyUnicode_FromObject(obj);
5756 if (self == NULL)
5757 return NULL;
5758 str1 = PyUnicode_FromObject(subobj);
5759 if (str1 == NULL) {
5760 Py_DECREF(self);
5761 return NULL;
5762 }
5763 str2 = PyUnicode_FromObject(replobj);
5764 if (str2 == NULL) {
5765 Py_DECREF(self);
5766 Py_DECREF(str1);
5767 return NULL;
5768 }
Tim Petersced69f82003-09-16 20:30:58 +00005769 result = replace((PyUnicodeObject *)self,
5770 (PyUnicodeObject *)str1,
5771 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 maxcount);
5773 Py_DECREF(self);
5774 Py_DECREF(str1);
5775 Py_DECREF(str2);
5776 return result;
5777}
5778
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005779PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780"S.replace (old, new[, maxsplit]) -> unicode\n\
5781\n\
5782Return a copy of S with all occurrences of substring\n\
5783old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005784given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785
5786static PyObject*
5787unicode_replace(PyUnicodeObject *self, PyObject *args)
5788{
5789 PyUnicodeObject *str1;
5790 PyUnicodeObject *str2;
5791 int maxcount = -1;
5792 PyObject *result;
5793
5794 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5795 return NULL;
5796 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5797 if (str1 == NULL)
5798 return NULL;
5799 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005800 if (str2 == NULL) {
5801 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005803 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804
5805 result = replace(self, str1, str2, maxcount);
5806
5807 Py_DECREF(str1);
5808 Py_DECREF(str2);
5809 return result;
5810}
5811
5812static
5813PyObject *unicode_repr(PyObject *unicode)
5814{
5815 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5816 PyUnicode_GET_SIZE(unicode),
5817 1);
5818}
5819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005820PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821"S.rfind(sub [,start [,end]]) -> int\n\
5822\n\
5823Return the highest index in S where substring sub is found,\n\
5824such that sub is contained within s[start,end]. Optional\n\
5825arguments start and end are interpreted as in slice notation.\n\
5826\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005827Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828
5829static PyObject *
5830unicode_rfind(PyUnicodeObject *self, PyObject *args)
5831{
5832 PyUnicodeObject *substring;
5833 int start = 0;
5834 int end = INT_MAX;
5835 PyObject *result;
5836
Guido van Rossumb8872e62000-05-09 14:14:27 +00005837 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5838 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 return NULL;
5840 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5841 (PyObject *)substring);
5842 if (substring == NULL)
5843 return NULL;
5844
5845 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5846
5847 Py_DECREF(substring);
5848 return result;
5849}
5850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005851PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852"S.rindex(sub [,start [,end]]) -> int\n\
5853\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005854Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855
5856static PyObject *
5857unicode_rindex(PyUnicodeObject *self, PyObject *args)
5858{
5859 int result;
5860 PyUnicodeObject *substring;
5861 int start = 0;
5862 int end = INT_MAX;
5863
Guido van Rossumb8872e62000-05-09 14:14:27 +00005864 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5865 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 return NULL;
5867 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5868 (PyObject *)substring);
5869 if (substring == NULL)
5870 return NULL;
5871
5872 result = findstring(self, substring, start, end, -1);
5873
5874 Py_DECREF(substring);
5875 if (result < 0) {
5876 PyErr_SetString(PyExc_ValueError, "substring not found");
5877 return NULL;
5878 }
5879 return PyInt_FromLong(result);
5880}
5881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005882PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005883"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884\n\
5885Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005886done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887
5888static PyObject *
5889unicode_rjust(PyUnicodeObject *self, PyObject *args)
5890{
5891 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005892 Py_UNICODE fillchar = ' ';
5893
5894 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 return NULL;
5896
Tim Peters7a29bd52001-09-12 03:03:31 +00005897 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 Py_INCREF(self);
5899 return (PyObject*) self;
5900 }
5901
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005902 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903}
5904
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905static PyObject*
5906unicode_slice(PyUnicodeObject *self, int start, int end)
5907{
5908 /* standard clamping */
5909 if (start < 0)
5910 start = 0;
5911 if (end < 0)
5912 end = 0;
5913 if (end > self->length)
5914 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005915 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 /* full slice, return original string */
5917 Py_INCREF(self);
5918 return (PyObject*) self;
5919 }
5920 if (start > end)
5921 start = end;
5922 /* copy slice */
5923 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5924 end - start);
5925}
5926
5927PyObject *PyUnicode_Split(PyObject *s,
5928 PyObject *sep,
5929 int maxsplit)
5930{
5931 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005932
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 s = PyUnicode_FromObject(s);
5934 if (s == NULL)
5935 return NULL;
5936 if (sep != NULL) {
5937 sep = PyUnicode_FromObject(sep);
5938 if (sep == NULL) {
5939 Py_DECREF(s);
5940 return NULL;
5941 }
5942 }
5943
5944 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5945
5946 Py_DECREF(s);
5947 Py_XDECREF(sep);
5948 return result;
5949}
5950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005951PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952"S.split([sep [,maxsplit]]) -> list of strings\n\
5953\n\
5954Return a list of the words in S, using sep as the\n\
5955delimiter string. If maxsplit is given, at most maxsplit\n\
5956splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005957is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958
5959static PyObject*
5960unicode_split(PyUnicodeObject *self, PyObject *args)
5961{
5962 PyObject *substring = Py_None;
5963 int maxcount = -1;
5964
5965 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5966 return NULL;
5967
5968 if (substring == Py_None)
5969 return split(self, NULL, maxcount);
5970 else if (PyUnicode_Check(substring))
5971 return split(self, (PyUnicodeObject *)substring, maxcount);
5972 else
5973 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5974}
5975
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005976PyObject *PyUnicode_RSplit(PyObject *s,
5977 PyObject *sep,
5978 int maxsplit)
5979{
5980 PyObject *result;
5981
5982 s = PyUnicode_FromObject(s);
5983 if (s == NULL)
5984 return NULL;
5985 if (sep != NULL) {
5986 sep = PyUnicode_FromObject(sep);
5987 if (sep == NULL) {
5988 Py_DECREF(s);
5989 return NULL;
5990 }
5991 }
5992
5993 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5994
5995 Py_DECREF(s);
5996 Py_XDECREF(sep);
5997 return result;
5998}
5999
6000PyDoc_STRVAR(rsplit__doc__,
6001"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6002\n\
6003Return a list of the words in S, using sep as the\n\
6004delimiter string, starting at the end of the string and\n\
6005working to the front. If maxsplit is given, at most maxsplit\n\
6006splits are done. If sep is not specified, any whitespace string\n\
6007is a separator.");
6008
6009static PyObject*
6010unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6011{
6012 PyObject *substring = Py_None;
6013 int maxcount = -1;
6014
6015 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6016 return NULL;
6017
6018 if (substring == Py_None)
6019 return rsplit(self, NULL, maxcount);
6020 else if (PyUnicode_Check(substring))
6021 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6022 else
6023 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6024}
6025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006026PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006027"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028\n\
6029Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006030Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006031is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
6033static PyObject*
6034unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6035{
Guido van Rossum86662912000-04-11 15:38:46 +00006036 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037
Guido van Rossum86662912000-04-11 15:38:46 +00006038 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 return NULL;
6040
Guido van Rossum86662912000-04-11 15:38:46 +00006041 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042}
6043
6044static
6045PyObject *unicode_str(PyUnicodeObject *self)
6046{
Fred Drakee4315f52000-05-09 19:53:39 +00006047 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048}
6049
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006050PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051"S.swapcase() -> unicode\n\
6052\n\
6053Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006054and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055
6056static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006057unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 return fixup(self, fixswapcase);
6060}
6061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006062PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063"S.translate(table) -> unicode\n\
6064\n\
6065Return a copy of the string S, where all characters have been mapped\n\
6066through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006067Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6068Unmapped characters are left untouched. Characters mapped to None\n\
6069are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070
6071static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006072unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073{
Tim Petersced69f82003-09-16 20:30:58 +00006074 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006076 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 "ignore");
6078}
6079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006080PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081"S.upper() -> unicode\n\
6082\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006083Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084
6085static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006086unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 return fixup(self, fixupper);
6089}
6090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006091PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092"S.zfill(width) -> unicode\n\
6093\n\
6094Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006095of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096
6097static PyObject *
6098unicode_zfill(PyUnicodeObject *self, PyObject *args)
6099{
6100 int fill;
6101 PyUnicodeObject *u;
6102
6103 int width;
6104 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6105 return NULL;
6106
6107 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006108 if (PyUnicode_CheckExact(self)) {
6109 Py_INCREF(self);
6110 return (PyObject*) self;
6111 }
6112 else
6113 return PyUnicode_FromUnicode(
6114 PyUnicode_AS_UNICODE(self),
6115 PyUnicode_GET_SIZE(self)
6116 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 }
6118
6119 fill = width - self->length;
6120
6121 u = pad(self, fill, 0, '0');
6122
Walter Dörwald068325e2002-04-15 13:36:47 +00006123 if (u == NULL)
6124 return NULL;
6125
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 if (u->str[fill] == '+' || u->str[fill] == '-') {
6127 /* move sign to beginning of string */
6128 u->str[0] = u->str[fill];
6129 u->str[fill] = '0';
6130 }
6131
6132 return (PyObject*) u;
6133}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134
#if 0
/* Compiled out.  Debugging helper that reports the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long cached = unicode_freelist_size;

    return PyInt_FromLong(cached);
}
#endif
6142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006143PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006144"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006146Return True if S starts with the specified prefix, False otherwise.\n\
6147With optional start, test S beginning at that position.\n\
6148With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149
6150static PyObject *
6151unicode_startswith(PyUnicodeObject *self,
6152 PyObject *args)
6153{
6154 PyUnicodeObject *substring;
6155 int start = 0;
6156 int end = INT_MAX;
6157 PyObject *result;
6158
Guido van Rossumb8872e62000-05-09 14:14:27 +00006159 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6160 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 return NULL;
6162 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6163 (PyObject *)substring);
6164 if (substring == NULL)
6165 return NULL;
6166
Guido van Rossum77f6a652002-04-03 22:41:51 +00006167 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168
6169 Py_DECREF(substring);
6170 return result;
6171}
6172
6173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006174PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006175"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006177Return True if S ends with the specified suffix, False otherwise.\n\
6178With optional start, test S beginning at that position.\n\
6179With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
6181static PyObject *
6182unicode_endswith(PyUnicodeObject *self,
6183 PyObject *args)
6184{
6185 PyUnicodeObject *substring;
6186 int start = 0;
6187 int end = INT_MAX;
6188 PyObject *result;
6189
Guido van Rossumb8872e62000-05-09 14:14:27 +00006190 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6191 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 return NULL;
6193 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6194 (PyObject *)substring);
6195 if (substring == NULL)
6196 return NULL;
6197
Guido van Rossum77f6a652002-04-03 22:41:51 +00006198 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199
6200 Py_DECREF(substring);
6201 return result;
6202}
6203
6204
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006205
6206static PyObject *
6207unicode_getnewargs(PyUnicodeObject *v)
6208{
6209 return Py_BuildValue("(u#)", v->str, v->length);
6210}
6211
6212
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213static PyMethodDef unicode_methods[] = {
6214
6215 /* Order is according to common usage: often used methods should
6216 appear first, since lookup is done sequentially. */
6217
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006218 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6219 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6220 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006221 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006222 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6223 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6224 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6225 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6226 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6227 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6228 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6229 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6230 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6231 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006232 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006233 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006234/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6235 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6236 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6237 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006238 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006239 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006240 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006241 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6242 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6243 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6244 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6245 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6246 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6247 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6248 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6249 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6250 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6251 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6252 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6253 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6254 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006255 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006256#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006257 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258#endif
6259
6260#if 0
6261 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006262 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263#endif
6264
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006265 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266 {NULL, NULL}
6267};
6268
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006269static PyObject *
6270unicode_mod(PyObject *v, PyObject *w)
6271{
6272 if (!PyUnicode_Check(v)) {
6273 Py_INCREF(Py_NotImplemented);
6274 return Py_NotImplemented;
6275 }
6276 return PyUnicode_Format(v, w);
6277}
6278
6279static PyNumberMethods unicode_as_number = {
6280 0, /*nb_add*/
6281 0, /*nb_subtract*/
6282 0, /*nb_multiply*/
6283 0, /*nb_divide*/
6284 unicode_mod, /*nb_remainder*/
6285};
6286
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287static PySequenceMethods unicode_as_sequence = {
6288 (inquiry) unicode_length, /* sq_length */
6289 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6290 (intargfunc) unicode_repeat, /* sq_repeat */
6291 (intargfunc) unicode_getitem, /* sq_item */
6292 (intintargfunc) unicode_slice, /* sq_slice */
6293 0, /* sq_ass_item */
6294 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006295 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296};
6297
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006298static PyObject*
6299unicode_subscript(PyUnicodeObject* self, PyObject* item)
6300{
6301 if (PyInt_Check(item)) {
6302 long i = PyInt_AS_LONG(item);
6303 if (i < 0)
6304 i += PyString_GET_SIZE(self);
6305 return unicode_getitem(self, i);
6306 } else if (PyLong_Check(item)) {
6307 long i = PyLong_AsLong(item);
6308 if (i == -1 && PyErr_Occurred())
6309 return NULL;
6310 if (i < 0)
6311 i += PyString_GET_SIZE(self);
6312 return unicode_getitem(self, i);
6313 } else if (PySlice_Check(item)) {
6314 int start, stop, step, slicelength, cur, i;
6315 Py_UNICODE* source_buf;
6316 Py_UNICODE* result_buf;
6317 PyObject* result;
6318
6319 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6320 &start, &stop, &step, &slicelength) < 0) {
6321 return NULL;
6322 }
6323
6324 if (slicelength <= 0) {
6325 return PyUnicode_FromUnicode(NULL, 0);
6326 } else {
6327 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6328 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6329
6330 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6331 result_buf[i] = source_buf[cur];
6332 }
Tim Petersced69f82003-09-16 20:30:58 +00006333
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006334 result = PyUnicode_FromUnicode(result_buf, slicelength);
6335 PyMem_FREE(result_buf);
6336 return result;
6337 }
6338 } else {
6339 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6340 return NULL;
6341 }
6342}
6343
6344static PyMappingMethods unicode_as_mapping = {
6345 (inquiry)unicode_length, /* mp_length */
6346 (binaryfunc)unicode_subscript, /* mp_subscript */
6347 (objobjargproc)0, /* mp_ass_subscript */
6348};
6349
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350static int
6351unicode_buffer_getreadbuf(PyUnicodeObject *self,
6352 int index,
6353 const void **ptr)
6354{
6355 if (index != 0) {
6356 PyErr_SetString(PyExc_SystemError,
6357 "accessing non-existent unicode segment");
6358 return -1;
6359 }
6360 *ptr = (void *) self->str;
6361 return PyUnicode_GET_DATA_SIZE(self);
6362}
6363
6364static int
6365unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6366 const void **ptr)
6367{
6368 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006369 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 return -1;
6371}
6372
6373static int
6374unicode_buffer_getsegcount(PyUnicodeObject *self,
6375 int *lenp)
6376{
6377 if (lenp)
6378 *lenp = PyUnicode_GET_DATA_SIZE(self);
6379 return 1;
6380}
6381
6382static int
6383unicode_buffer_getcharbuf(PyUnicodeObject *self,
6384 int index,
6385 const void **ptr)
6386{
6387 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006388
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 if (index != 0) {
6390 PyErr_SetString(PyExc_SystemError,
6391 "accessing non-existent unicode segment");
6392 return -1;
6393 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006394 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 if (str == NULL)
6396 return -1;
6397 *ptr = (void *) PyString_AS_STRING(str);
6398 return PyString_GET_SIZE(str);
6399}
6400
6401/* Helpers for PyUnicode_Format() */
6402
6403static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006404getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405{
6406 int argidx = *p_argidx;
6407 if (argidx < arglen) {
6408 (*p_argidx)++;
6409 if (arglen < 0)
6410 return args;
6411 else
6412 return PyTuple_GetItem(args, argidx);
6413 }
6414 PyErr_SetString(PyExc_TypeError,
6415 "not enough arguments for format string");
6416 return NULL;
6417}
6418
/* Bit flags carried in the `flags' argument of the format helpers; each
   one occupies its own bit so they can be OR-ed together.
   NOTE(review): the names suggest the printf-style flag characters
   '-', '+', ' ', '#' and '0'; only F_ALT <-> '#' is visible in the
   helpers nearby -- confirm the others against PyUnicode_Format(). */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6424
6425static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427{
6428 register int i;
6429 int len;
6430 va_list va;
6431 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433
6434 /* First, format the string as char array, then expand to Py_UNICODE
6435 array. */
6436 charbuffer = (char *)buffer;
6437 len = vsprintf(charbuffer, format, va);
6438 for (i = len - 1; i >= 0; i--)
6439 buffer[i] = (Py_UNICODE) charbuffer[i];
6440
6441 va_end(va);
6442 return len;
6443}
6444
Guido van Rossum078151d2002-08-11 04:24:12 +00006445/* XXX To save some code duplication, formatfloat/long/int could have been
6446 shared with stringobject.c, converting from 8-bit to Unicode after the
6447 formatting is done. */
6448
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449static int
6450formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006451 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 int flags,
6453 int prec,
6454 int type,
6455 PyObject *v)
6456{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006457 /* fmt = '%#.' + `prec` + `type`
6458 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 char fmt[20];
6460 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006461
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 x = PyFloat_AsDouble(v);
6463 if (x == -1.0 && PyErr_Occurred())
6464 return -1;
6465 if (prec < 0)
6466 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6468 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006469 /* Worst case length calc to ensure no buffer overrun:
6470
6471 'g' formats:
6472 fmt = %#.<prec>g
6473 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6474 for any double rep.)
6475 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6476
6477 'f' formats:
6478 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6479 len = 1 + 50 + 1 + prec = 52 + prec
6480
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006481 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006482 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006483
6484 */
6485 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6486 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006487 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006488 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006489 return -1;
6490 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006491 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6492 (flags&F_ALT) ? "#" : "",
6493 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 return usprintf(buf, fmt, x);
6495}
6496
Tim Peters38fd5b62000-09-21 05:43:11 +00006497static PyObject*
6498formatlong(PyObject *val, int flags, int prec, int type)
6499{
6500 char *buf;
6501 int i, len;
6502 PyObject *str; /* temporary string object. */
6503 PyUnicodeObject *result;
6504
6505 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6506 if (!str)
6507 return NULL;
6508 result = _PyUnicode_New(len);
6509 for (i = 0; i < len; i++)
6510 result->str[i] = buf[i];
6511 result->str[len] = 0;
6512 Py_DECREF(str);
6513 return (PyObject*)result;
6514}
6515
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516static int
6517formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006518 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 int flags,
6520 int prec,
6521 int type,
6522 PyObject *v)
6523{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006524 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006525 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6526 * + 1 + 1
6527 * = 24
6528 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006529 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006530 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 long x;
6532
6533 x = PyInt_AsLong(v);
6534 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006535 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006536 if (x < 0 && type == 'u') {
6537 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006538 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006539 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6540 sign = "-";
6541 else
6542 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006544 prec = 1;
6545
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006546 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6547 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006548 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006549 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006550 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006551 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006552 return -1;
6553 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006554
6555 if ((flags & F_ALT) &&
6556 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006557 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006558 * of issues that cause pain:
6559 * - when 0 is being converted, the C standard leaves off
6560 * the '0x' or '0X', which is inconsistent with other
6561 * %#x/%#X conversions and inconsistent with Python's
6562 * hex() function
6563 * - there are platforms that violate the standard and
6564 * convert 0 with the '0x' or '0X'
6565 * (Metrowerks, Compaq Tru64)
6566 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006567 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006568 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006569 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006570 * We can achieve the desired consistency by inserting our
6571 * own '0x' or '0X' prefix, and substituting %x/%X in place
6572 * of %#x/%#X.
6573 *
6574 * Note that this is the same approach as used in
6575 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006576 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006577 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6578 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006579 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006580 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006581 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6582 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006583 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006584 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006585 if (sign[0])
6586 return usprintf(buf, fmt, -x);
6587 else
6588 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589}
6590
6591static int
6592formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006593 size_t buflen,
6594 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006596 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006597 if (PyUnicode_Check(v)) {
6598 if (PyUnicode_GET_SIZE(v) != 1)
6599 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006603 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006604 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006605 goto onError;
6606 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608
6609 else {
6610 /* Integer input truncated to a character */
6611 long x;
6612 x = PyInt_AsLong(v);
6613 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006614 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006615#ifdef Py_UNICODE_WIDE
6616 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006617 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006618 "%c arg not in range(0x110000) "
6619 "(wide Python build)");
6620 return -1;
6621 }
6622#else
6623 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006624 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006625 "%c arg not in range(0x10000) "
6626 "(narrow Python build)");
6627 return -1;
6628 }
6629#endif
6630 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 }
6632 buf[1] = '\0';
6633 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006634
6635 onError:
6636 PyErr_SetString(PyExc_TypeError,
6637 "%c requires int or char");
6638 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639}
6640
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
6650
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651PyObject *PyUnicode_Format(PyObject *format,
6652 PyObject *args)
6653{
6654 Py_UNICODE *fmt, *res;
6655 int fmtcnt, rescnt, reslen, arglen, argidx;
6656 int args_owned = 0;
6657 PyUnicodeObject *result = NULL;
6658 PyObject *dict = NULL;
6659 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006660
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 if (format == NULL || args == NULL) {
6662 PyErr_BadInternalCall();
6663 return NULL;
6664 }
6665 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006666 if (uformat == NULL)
6667 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 fmt = PyUnicode_AS_UNICODE(uformat);
6669 fmtcnt = PyUnicode_GET_SIZE(uformat);
6670
6671 reslen = rescnt = fmtcnt + 100;
6672 result = _PyUnicode_New(reslen);
6673 if (result == NULL)
6674 goto onError;
6675 res = PyUnicode_AS_UNICODE(result);
6676
6677 if (PyTuple_Check(args)) {
6678 arglen = PyTuple_Size(args);
6679 argidx = 0;
6680 }
6681 else {
6682 arglen = -1;
6683 argidx = -2;
6684 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006685 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6686 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 dict = args;
6688
6689 while (--fmtcnt >= 0) {
6690 if (*fmt != '%') {
6691 if (--rescnt < 0) {
6692 rescnt = fmtcnt + 100;
6693 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006694 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 return NULL;
6696 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6697 --rescnt;
6698 }
6699 *res++ = *fmt++;
6700 }
6701 else {
6702 /* Got a format specifier */
6703 int flags = 0;
6704 int width = -1;
6705 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 Py_UNICODE c = '\0';
6707 Py_UNICODE fill;
6708 PyObject *v = NULL;
6709 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006710 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 Py_UNICODE sign;
6712 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006713 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714
6715 fmt++;
6716 if (*fmt == '(') {
6717 Py_UNICODE *keystart;
6718 int keylen;
6719 PyObject *key;
6720 int pcount = 1;
6721
6722 if (dict == NULL) {
6723 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006724 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 goto onError;
6726 }
6727 ++fmt;
6728 --fmtcnt;
6729 keystart = fmt;
6730 /* Skip over balanced parentheses */
6731 while (pcount > 0 && --fmtcnt >= 0) {
6732 if (*fmt == ')')
6733 --pcount;
6734 else if (*fmt == '(')
6735 ++pcount;
6736 fmt++;
6737 }
6738 keylen = fmt - keystart - 1;
6739 if (fmtcnt < 0 || pcount > 0) {
6740 PyErr_SetString(PyExc_ValueError,
6741 "incomplete format key");
6742 goto onError;
6743 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006744#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006745 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746 then looked up since Python uses strings to hold
6747 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006748 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 key = PyUnicode_EncodeUTF8(keystart,
6750 keylen,
6751 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006752#else
6753 key = PyUnicode_FromUnicode(keystart, keylen);
6754#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 if (key == NULL)
6756 goto onError;
6757 if (args_owned) {
6758 Py_DECREF(args);
6759 args_owned = 0;
6760 }
6761 args = PyObject_GetItem(dict, key);
6762 Py_DECREF(key);
6763 if (args == NULL) {
6764 goto onError;
6765 }
6766 args_owned = 1;
6767 arglen = -1;
6768 argidx = -2;
6769 }
6770 while (--fmtcnt >= 0) {
6771 switch (c = *fmt++) {
6772 case '-': flags |= F_LJUST; continue;
6773 case '+': flags |= F_SIGN; continue;
6774 case ' ': flags |= F_BLANK; continue;
6775 case '#': flags |= F_ALT; continue;
6776 case '0': flags |= F_ZERO; continue;
6777 }
6778 break;
6779 }
6780 if (c == '*') {
6781 v = getnextarg(args, arglen, &argidx);
6782 if (v == NULL)
6783 goto onError;
6784 if (!PyInt_Check(v)) {
6785 PyErr_SetString(PyExc_TypeError,
6786 "* wants int");
6787 goto onError;
6788 }
6789 width = PyInt_AsLong(v);
6790 if (width < 0) {
6791 flags |= F_LJUST;
6792 width = -width;
6793 }
6794 if (--fmtcnt >= 0)
6795 c = *fmt++;
6796 }
6797 else if (c >= '0' && c <= '9') {
6798 width = c - '0';
6799 while (--fmtcnt >= 0) {
6800 c = *fmt++;
6801 if (c < '0' || c > '9')
6802 break;
6803 if ((width*10) / 10 != width) {
6804 PyErr_SetString(PyExc_ValueError,
6805 "width too big");
6806 goto onError;
6807 }
6808 width = width*10 + (c - '0');
6809 }
6810 }
6811 if (c == '.') {
6812 prec = 0;
6813 if (--fmtcnt >= 0)
6814 c = *fmt++;
6815 if (c == '*') {
6816 v = getnextarg(args, arglen, &argidx);
6817 if (v == NULL)
6818 goto onError;
6819 if (!PyInt_Check(v)) {
6820 PyErr_SetString(PyExc_TypeError,
6821 "* wants int");
6822 goto onError;
6823 }
6824 prec = PyInt_AsLong(v);
6825 if (prec < 0)
6826 prec = 0;
6827 if (--fmtcnt >= 0)
6828 c = *fmt++;
6829 }
6830 else if (c >= '0' && c <= '9') {
6831 prec = c - '0';
6832 while (--fmtcnt >= 0) {
6833 c = Py_CHARMASK(*fmt++);
6834 if (c < '0' || c > '9')
6835 break;
6836 if ((prec*10) / 10 != prec) {
6837 PyErr_SetString(PyExc_ValueError,
6838 "prec too big");
6839 goto onError;
6840 }
6841 prec = prec*10 + (c - '0');
6842 }
6843 }
6844 } /* prec */
6845 if (fmtcnt >= 0) {
6846 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 if (--fmtcnt >= 0)
6848 c = *fmt++;
6849 }
6850 }
6851 if (fmtcnt < 0) {
6852 PyErr_SetString(PyExc_ValueError,
6853 "incomplete format");
6854 goto onError;
6855 }
6856 if (c != '%') {
6857 v = getnextarg(args, arglen, &argidx);
6858 if (v == NULL)
6859 goto onError;
6860 }
6861 sign = 0;
6862 fill = ' ';
6863 switch (c) {
6864
6865 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006866 pbuf = formatbuf;
6867 /* presume that buffer length is at least 1 */
6868 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 len = 1;
6870 break;
6871
6872 case 's':
6873 case 'r':
6874 if (PyUnicode_Check(v) && c == 's') {
6875 temp = v;
6876 Py_INCREF(temp);
6877 }
6878 else {
6879 PyObject *unicode;
6880 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006881 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 else
6883 temp = PyObject_Repr(v);
6884 if (temp == NULL)
6885 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006886 if (PyUnicode_Check(temp))
6887 /* nothing to do */;
6888 else if (PyString_Check(temp)) {
6889 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00006890 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006892 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 "strict");
6894 Py_DECREF(temp);
6895 temp = unicode;
6896 if (temp == NULL)
6897 goto onError;
6898 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006899 else {
6900 Py_DECREF(temp);
6901 PyErr_SetString(PyExc_TypeError,
6902 "%s argument has non-string str()");
6903 goto onError;
6904 }
6905 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006906 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 len = PyUnicode_GET_SIZE(temp);
6908 if (prec >= 0 && len > prec)
6909 len = prec;
6910 break;
6911
6912 case 'i':
6913 case 'd':
6914 case 'u':
6915 case 'o':
6916 case 'x':
6917 case 'X':
6918 if (c == 'i')
6919 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006920 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006921 temp = formatlong(v, flags, prec, c);
6922 if (!temp)
6923 goto onError;
6924 pbuf = PyUnicode_AS_UNICODE(temp);
6925 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00006926 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006928 else {
6929 pbuf = formatbuf;
6930 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6931 flags, prec, c, v);
6932 if (len < 0)
6933 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006934 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006935 }
6936 if (flags & F_ZERO)
6937 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 break;
6939
6940 case 'e':
6941 case 'E':
6942 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006943 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 case 'g':
6945 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006946 if (c == 'F')
6947 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006948 pbuf = formatbuf;
6949 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6950 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 if (len < 0)
6952 goto onError;
6953 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006954 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 fill = '0';
6956 break;
6957
6958 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006959 pbuf = formatbuf;
6960 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961 if (len < 0)
6962 goto onError;
6963 break;
6964
6965 default:
6966 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006967 "unsupported format character '%c' (0x%x) "
6968 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006969 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006970 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006971 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 goto onError;
6973 }
6974 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006975 if (*pbuf == '-' || *pbuf == '+') {
6976 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 len--;
6978 }
6979 else if (flags & F_SIGN)
6980 sign = '+';
6981 else if (flags & F_BLANK)
6982 sign = ' ';
6983 else
6984 sign = 0;
6985 }
6986 if (width < len)
6987 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006988 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 reslen -= rescnt;
6990 rescnt = width + fmtcnt + 100;
6991 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006992 if (reslen < 0) {
6993 Py_DECREF(result);
6994 return PyErr_NoMemory();
6995 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006996 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997 return NULL;
6998 res = PyUnicode_AS_UNICODE(result)
6999 + reslen - rescnt;
7000 }
7001 if (sign) {
7002 if (fill != ' ')
7003 *res++ = sign;
7004 rescnt--;
7005 if (width > len)
7006 width--;
7007 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007008 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7009 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007010 assert(pbuf[1] == c);
7011 if (fill != ' ') {
7012 *res++ = *pbuf++;
7013 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007014 }
Tim Petersfff53252001-04-12 18:38:48 +00007015 rescnt -= 2;
7016 width -= 2;
7017 if (width < 0)
7018 width = 0;
7019 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 if (width > len && !(flags & F_LJUST)) {
7022 do {
7023 --rescnt;
7024 *res++ = fill;
7025 } while (--width > len);
7026 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007027 if (fill == ' ') {
7028 if (sign)
7029 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007030 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007031 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007032 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007033 *res++ = *pbuf++;
7034 *res++ = *pbuf++;
7035 }
7036 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007037 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 res += len;
7039 rescnt -= len;
7040 while (--width >= len) {
7041 --rescnt;
7042 *res++ = ' ';
7043 }
7044 if (dict && (argidx < arglen) && c != '%') {
7045 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007046 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 goto onError;
7048 }
7049 Py_XDECREF(temp);
7050 } /* '%' */
7051 } /* until end */
7052 if (argidx < arglen && !dict) {
7053 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007054 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055 goto onError;
7056 }
7057
7058 if (args_owned) {
7059 Py_DECREF(args);
7060 }
7061 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007062 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007063 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 return (PyObject *)result;
7065
7066 onError:
7067 Py_XDECREF(result);
7068 Py_DECREF(uformat);
7069 if (args_owned) {
7070 Py_DECREF(args);
7071 }
7072 return NULL;
7073}
7074
/* Buffer interface slot table for unicode objects; installed as
   PyUnicode_Type.tp_as_buffer.  Entries are, in order, the read-buffer,
   write-buffer, segment-count and char-buffer handlers. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
7081
Jeremy Hylton938ace62002-07-17 16:30:39 +00007082static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007083unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7084
Tim Peters6d6c1a32001-08-02 04:15:00 +00007085static PyObject *
7086unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7087{
7088 PyObject *x = NULL;
7089 static char *kwlist[] = {"string", "encoding", "errors", 0};
7090 char *encoding = NULL;
7091 char *errors = NULL;
7092
Guido van Rossume023fe02001-08-30 03:12:59 +00007093 if (type != &PyUnicode_Type)
7094 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007095 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7096 kwlist, &x, &encoding, &errors))
7097 return NULL;
7098 if (x == NULL)
7099 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007100 if (encoding == NULL && errors == NULL)
7101 return PyObject_Unicode(x);
7102 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007103 return PyUnicode_FromEncodedObject(x, encoding, errors);
7104}
7105
Guido van Rossume023fe02001-08-30 03:12:59 +00007106static PyObject *
7107unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7108{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007109 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007110 int n;
7111
7112 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7113 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7114 if (tmp == NULL)
7115 return NULL;
7116 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007117 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007118 if (pnew == NULL) {
7119 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007120 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007121 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007122 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7123 if (pnew->str == NULL) {
7124 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007125 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007126 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007127 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007128 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007129 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7130 pnew->length = n;
7131 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007132 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007133 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007134}
7135
/* Docstring of the unicode type; installed as PyUnicode_Type.tp_doc. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007142
/* Type object for unicode.  It derives from PyBaseString_Type, can be
   subclassed (Py_TPFLAGS_BASETYPE) and creates instances through
   unicode_new().  Slots left at 0 are not set explicitly here. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE,                /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
7187
7188/* Initialize the Unicode implementation */
7189
Thomas Wouters78890102000-07-22 19:25:51 +00007190void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007192 int i;
7193
Fred Drakee4315f52000-05-09 19:53:39 +00007194 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007195 unicode_freelist = NULL;
7196 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007198 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007199 for (i = 0; i < 256; i++)
7200 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007201 if (PyType_Ready(&PyUnicode_Type) < 0)
7202 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203}
7204
7205/* Finalize the Unicode implementation */
7206
7207void
Thomas Wouters78890102000-07-22 19:25:51 +00007208_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007210 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007211 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007213 Py_XDECREF(unicode_empty);
7214 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007215
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007216 for (i = 0; i < 256; i++) {
7217 if (unicode_latin1[i]) {
7218 Py_DECREF(unicode_latin1[i]);
7219 unicode_latin1[i] = NULL;
7220 }
7221 }
7222
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007223 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 PyUnicodeObject *v = u;
7225 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007226 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007227 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007228 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007229 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007231 unicode_freelist = NULL;
7232 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007234
7235/*
7236Local variables:
7237c-basic-offset: 4
7238indent-tabs-mode: nil
7239End:
7240*/