blob: 670fca4d51eea22481d10874fc096303a60e1931 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE (1024)

/* Threshold for the free list "stay alive" optimization.

   Objects placed on the free list keep their allocated Unicode buffer
   as long as their size is less than this limit.  This reduces
   malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT (9)

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
/* Internal API for use in unicodeobject.c only !
   Casts the address of a PyUnicodeObject* variable to PyObject** and
   forwards to PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize((PyObject **)(unicodevar), (length))
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
371 for (i = size; i >= 0; i--)
372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
387 if (size > PyUnicode_GET_SIZE(unicode))
388 size = PyUnicode_GET_SIZE(unicode);
389#ifdef HAVE_USABLE_WCHAR_T
390 memcpy(w, unicode->str, size * sizeof(wchar_t));
391#else
392 {
393 register Py_UNICODE *u;
394 register int i;
395 u = PyUnicode_AS_UNICODE(unicode);
396 for (i = size; i >= 0; i--)
397 *w++ = *u++;
398 }
399#endif
400
401 return size;
402}
403
404#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000408 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
Hye-Shik Chang40574832004-04-06 07:24:51 +0000426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000428}
429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromObject(register PyObject *obj)
431{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000432 /* XXX Perhaps we should make this API an alias of
433 PyObject_Unicode() instead ?! */
434 if (PyUnicode_CheckExact(obj)) {
435 Py_INCREF(obj);
436 return obj;
437 }
438 if (PyUnicode_Check(obj)) {
439 /* For a Unicode subtype that's not a Unicode object,
440 return a true Unicode object with the same data. */
441 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
442 PyUnicode_GET_SIZE(obj));
443 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000444 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
445}
446
447PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
448 const char *encoding,
449 const char *errors)
450{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000451 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000453 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 if (obj == NULL) {
456 PyErr_BadInternalCall();
457 return NULL;
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460#if 0
461 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000462 that no encodings is given and then redirect to
463 PyObject_Unicode() which then applies the additional logic for
464 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466 NOTE: This API should really only be used for object which
467 represent *encoded* Unicode !
468
469 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000470 if (PyUnicode_Check(obj)) {
471 if (encoding) {
472 PyErr_SetString(PyExc_TypeError,
473 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000474 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000475 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000476 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000477 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000478#else
479 if (PyUnicode_Check(obj)) {
480 PyErr_SetString(PyExc_TypeError,
481 "decoding Unicode is not supported");
482 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#endif
485
486 /* Coerce object */
487 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000488 s = PyString_AS_STRING(obj);
489 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
492 /* Overwrite the error message with something more useful in
493 case of a TypeError. */
494 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000495 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000496 "coercing to Unicode: need string or buffer, "
497 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 obj->ob_type->tp_name);
499 goto onError;
500 }
Tim Petersced69f82003-09-16 20:30:58 +0000501
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000502 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 if (len == 0) {
504 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000509
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 return v;
511
512 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514}
515
516PyObject *PyUnicode_Decode(const char *s,
517 int size,
518 const char *encoding,
519 const char *errors)
520{
521 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000522
523 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000524 encoding = PyUnicode_GetDefaultEncoding();
525
526 /* Shortcuts for common default encodings */
527 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000529 else if (strcmp(encoding, "latin-1") == 0)
530 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
532 else if (strcmp(encoding, "mbcs") == 0)
533 return PyUnicode_DecodeMBCS(s, size, errors);
534#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "ascii") == 0)
536 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537
538 /* Decode via the codec registry */
539 buffer = PyBuffer_FromMemory((void *)s, size);
540 if (buffer == NULL)
541 goto onError;
542 unicode = PyCodec_Decode(buffer, encoding, errors);
543 if (unicode == NULL)
544 goto onError;
545 if (!PyUnicode_Check(unicode)) {
546 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000547 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548 unicode->ob_type->tp_name);
549 Py_DECREF(unicode);
550 goto onError;
551 }
552 Py_DECREF(buffer);
553 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000554
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555 onError:
556 Py_XDECREF(buffer);
557 return NULL;
558}
559
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000560PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
561 const char *encoding,
562 const char *errors)
563{
564 PyObject *v;
565
566 if (!PyUnicode_Check(unicode)) {
567 PyErr_BadArgument();
568 goto onError;
569 }
570
571 if (encoding == NULL)
572 encoding = PyUnicode_GetDefaultEncoding();
573
574 /* Decode via the codec registry */
575 v = PyCodec_Decode(unicode, encoding, errors);
576 if (v == NULL)
577 goto onError;
578 return v;
579
580 onError:
581 return NULL;
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584PyObject *PyUnicode_Encode(const Py_UNICODE *s,
585 int size,
586 const char *encoding,
587 const char *errors)
588{
589 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000590
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 unicode = PyUnicode_FromUnicode(s, size);
592 if (unicode == NULL)
593 return NULL;
594 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
595 Py_DECREF(unicode);
596 return v;
597}
598
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000599PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
600 const char *encoding,
601 const char *errors)
602{
603 PyObject *v;
604
605 if (!PyUnicode_Check(unicode)) {
606 PyErr_BadArgument();
607 goto onError;
608 }
609
610 if (encoding == NULL)
611 encoding = PyUnicode_GetDefaultEncoding();
612
613 /* Encode via the codec registry */
614 v = PyCodec_Encode(unicode, encoding, errors);
615 if (v == NULL)
616 goto onError;
617 return v;
618
619 onError:
620 return NULL;
621}
622
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
624 const char *encoding,
625 const char *errors)
626{
627 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 if (!PyUnicode_Check(unicode)) {
630 PyErr_BadArgument();
631 goto onError;
632 }
Fred Drakee4315f52000-05-09 19:53:39 +0000633
Tim Petersced69f82003-09-16 20:30:58 +0000634 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000635 encoding = PyUnicode_GetDefaultEncoding();
636
637 /* Shortcuts for common default encodings */
638 if (errors == NULL) {
639 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000640 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000641 else if (strcmp(encoding, "latin-1") == 0)
642 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000643#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
644 else if (strcmp(encoding, "mbcs") == 0)
645 return PyUnicode_AsMBCSString(unicode);
646#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000647 else if (strcmp(encoding, "ascii") == 0)
648 return PyUnicode_AsASCIIString(unicode);
649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650
651 /* Encode via the codec registry */
652 v = PyCodec_Encode(unicode, encoding, errors);
653 if (v == NULL)
654 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 if (!PyString_Check(v)) {
656 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000657 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 v->ob_type->tp_name);
659 Py_DECREF(v);
660 goto onError;
661 }
662 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 onError:
665 return NULL;
666}
667
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000668PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
669 const char *errors)
670{
671 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
672
673 if (v)
674 return v;
675 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
676 if (v && errors == NULL)
677 ((PyUnicodeObject *)unicode)->defenc = v;
678 return v;
679}
680
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
682{
683 if (!PyUnicode_Check(unicode)) {
684 PyErr_BadArgument();
685 goto onError;
686 }
687 return PyUnicode_AS_UNICODE(unicode);
688
689 onError:
690 return NULL;
691}
692
693int PyUnicode_GetSize(PyObject *unicode)
694{
695 if (!PyUnicode_Check(unicode)) {
696 PyErr_BadArgument();
697 goto onError;
698 }
699 return PyUnicode_GET_SIZE(unicode);
700
701 onError:
702 return -1;
703}
704
Thomas Wouters78890102000-07-22 19:25:51 +0000705const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000706{
707 return unicode_default_encoding;
708}
709
710int PyUnicode_SetDefaultEncoding(const char *encoding)
711{
712 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000713
Fred Drakee4315f52000-05-09 19:53:39 +0000714 /* Make sure the encoding is valid. As side effect, this also
715 loads the encoding into the codec registry cache. */
716 v = _PyCodec_Lookup(encoding);
717 if (v == NULL)
718 goto onError;
719 Py_DECREF(v);
720 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000721 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000722 sizeof(unicode_default_encoding));
723 return 0;
724
725 onError:
726 return -1;
727}
728
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error
*/
735
736static
737int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
738 const char *encoding, const char *reason,
739 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
740 PyObject **output, int *outpos, Py_UNICODE **outptr)
741{
742 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
743
744 PyObject *restuple = NULL;
745 PyObject *repunicode = NULL;
746 int outsize = PyUnicode_GET_SIZE(*output);
747 int requiredsize;
748 int newpos;
749 Py_UNICODE *repptr;
750 int repsize;
751 int res = -1;
752
753 if (*errorHandler == NULL) {
754 *errorHandler = PyCodec_LookupError(errors);
755 if (*errorHandler == NULL)
756 goto onError;
757 }
758
759 if (*exceptionObject == NULL) {
760 *exceptionObject = PyUnicodeDecodeError_Create(
761 encoding, input, insize, *startinpos, *endinpos, reason);
762 if (*exceptionObject == NULL)
763 goto onError;
764 }
765 else {
766 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
767 goto onError;
768 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
769 goto onError;
770 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
771 goto onError;
772 }
773
774 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
775 if (restuple == NULL)
776 goto onError;
777 if (!PyTuple_Check(restuple)) {
778 PyErr_Format(PyExc_TypeError, &argparse[4]);
779 goto onError;
780 }
781 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
782 goto onError;
783 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000784 newpos = insize+newpos;
785 if (newpos<0 || newpos>insize) {
786 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
787 goto onError;
788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000789
790 /* need more space? (at least enough for what we
791 have+the replacement+the rest of the string (starting
792 at the new input position), so we won't have to check space
793 when there are no errors in the rest of the string) */
794 repptr = PyUnicode_AS_UNICODE(repunicode);
795 repsize = PyUnicode_GET_SIZE(repunicode);
796 requiredsize = *outpos + repsize + insize-newpos;
797 if (requiredsize > outsize) {
798 if (requiredsize<2*outsize)
799 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000800 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000801 goto onError;
802 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
803 }
804 *endinpos = newpos;
805 *inptr = input + newpos;
806 Py_UNICODE_COPY(*outptr, repptr, repsize);
807 *outptr += repsize;
808 *outpos += repsize;
809 /* we made it! */
810 res = 0;
811
812 onError:
813 Py_XDECREF(restuple);
814 return res;
815}
816
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000817/* --- UTF-7 Codec -------------------------------------------------------- */
818
819/* see RFC2152 for details */
820
/* Classification of the 128 ASCII code points for the UTF-7 codec, indexed
   by character code.  Each entry indicates whether the character is
   "special", i.e. cannot be directly encoded:

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional)

   Classes 2 and 3 are only treated as special when the corresponding
   encodeWS / encodeO flag of the SPECIAL() macro is set.  The table is only
   ever read, so it is declared const. */
static
const char utf7_special[128] = {
    /* 0x00 - 0x0f: controls; TAB (9), LF (10) and CR (13) are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f: controls */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
839
/* Helper macros for the UTF-7 codec.  Arguments may be evaluated more than
   once, so only pass side-effect-free expressions. */

/* True if character c cannot be written directly and has to be base64
   encoded: always for non-ASCII characters and class 1 entries of
   utf7_special, and for whitespace (class 2) / Set O (class 3) only when
   encodeWS / encodeO is non-zero. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c) > 127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 alphabet character. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c belongs to the base64 alphabet.  isalnum() is only consulted
   for values below 128: calling it with a value that is not representable
   as unsigned char is undefined, and for 128..255 its answer depends on the
   current locale, whereas the base64 alphabet is pure ASCII. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128UL && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* Inverse of B64(): map a base64 alphabet character to its 6-bit value.
   The result is only meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit as many complete base64 characters as the "bits" pending low bits of
   "ch" allow.  Advances "out" and decrements "bits"; "ch" is not modified. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Extract as many complete 16-bit units as the "bits" pending low bits of
   "ch" allow and append them to "out".  Must be expanded inside a function
   that has a local "errmsg" variable and a "utf7Error" label: a unit in the
   range 0xDC00..0xDFFF sets "surrogate", sets errmsg and jumps there. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct \
                   or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)

Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000875PyObject *PyUnicode_DecodeUTF7(const char *s,
876 int size,
877 const char *errors)
878{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000879 const char *starts = s;
880 int startinpos;
881 int endinpos;
882 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000883 const char *e;
884 PyUnicodeObject *unicode;
885 Py_UNICODE *p;
886 const char *errmsg = "";
887 int inShift = 0;
888 unsigned int bitsleft = 0;
889 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000890 int surrogate = 0;
891 PyObject *errorHandler = NULL;
892 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000893
894 unicode = _PyUnicode_New(size);
895 if (!unicode)
896 return NULL;
897 if (size == 0)
898 return (PyObject *)unicode;
899
900 p = unicode->str;
901 e = s + size;
902
903 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 Py_UNICODE ch;
905 restart:
906 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000907
908 if (inShift) {
909 if ((ch == '-') || !B64CHAR(ch)) {
910 inShift = 0;
911 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000912
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000913 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
914 if (bitsleft >= 6) {
915 /* The shift sequence has a partial character in it. If
916 bitsleft < 6 then we could just classify it as padding
917 but that is not the case here */
918
919 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000920 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000921 }
922 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000923 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000924 here so indicate the potential of a misencoded character. */
925
926 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
927 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
928 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000929 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000930 }
931
932 if (ch == '-') {
933 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000934 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000935 inShift = 1;
936 }
937 } else if (SPECIAL(ch,0,0)) {
938 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000939 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000940 } else {
941 *p++ = ch;
942 }
943 } else {
944 charsleft = (charsleft << 6) | UB64(ch);
945 bitsleft += 6;
946 s++;
947 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
948 }
949 }
950 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000952 s++;
953 if (s < e && *s == '-') {
954 s++;
955 *p++ = '+';
956 } else
957 {
958 inShift = 1;
959 bitsleft = 0;
960 }
961 }
962 else if (SPECIAL(ch,0,0)) {
963 errmsg = "unexpected special character";
964 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000965 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000966 }
967 else {
968 *p++ = ch;
969 s++;
970 }
971 continue;
972 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000973 outpos = p-PyUnicode_AS_UNICODE(unicode);
974 endinpos = s-starts;
975 if (unicode_decode_call_errorhandler(
976 errors, &errorHandler,
977 "utf7", errmsg,
978 starts, size, &startinpos, &endinpos, &exc, &s,
979 (PyObject **)&unicode, &outpos, &p))
980 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000981 }
982
983 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000984 outpos = p-PyUnicode_AS_UNICODE(unicode);
985 endinpos = size;
986 if (unicode_decode_call_errorhandler(
987 errors, &errorHandler,
988 "utf7", "unterminated shift sequence",
989 starts, size, &startinpos, &endinpos, &exc, &s,
990 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000992 if (s < e)
993 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000994 }
995
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000996 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 goto onError;
998
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000999 Py_XDECREF(errorHandler);
1000 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001001 return (PyObject *)unicode;
1002
1003onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001004 Py_XDECREF(errorHandler);
1005 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006 Py_DECREF(unicode);
1007 return NULL;
1008}
1009
1010
1011PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1012 int size,
1013 int encodeSetO,
1014 int encodeWhiteSpace,
1015 const char *errors)
1016{
1017 PyObject *v;
1018 /* It might be possible to tighten this worst case */
1019 unsigned int cbAllocated = 5 * size;
1020 int inShift = 0;
1021 int i = 0;
1022 unsigned int bitsleft = 0;
1023 unsigned long charsleft = 0;
1024 char * out;
1025 char * start;
1026
1027 if (size == 0)
1028 return PyString_FromStringAndSize(NULL, 0);
1029
1030 v = PyString_FromStringAndSize(NULL, cbAllocated);
1031 if (v == NULL)
1032 return NULL;
1033
1034 start = out = PyString_AS_STRING(v);
1035 for (;i < size; ++i) {
1036 Py_UNICODE ch = s[i];
1037
1038 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001039 if (ch == '+') {
1040 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001041 *out++ = '-';
1042 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1043 charsleft = ch;
1044 bitsleft = 16;
1045 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001046 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001047 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001048 } else {
1049 *out++ = (char) ch;
1050 }
1051 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1053 *out++ = B64(charsleft << (6-bitsleft));
1054 charsleft = 0;
1055 bitsleft = 0;
1056 /* Characters not in the BASE64 set implicitly unshift the sequence
1057 so no '-' is required, except if the character is itself a '-' */
1058 if (B64CHAR(ch) || ch == '-') {
1059 *out++ = '-';
1060 }
1061 inShift = 0;
1062 *out++ = (char) ch;
1063 } else {
1064 bitsleft += 16;
1065 charsleft = (charsleft << 16) | ch;
1066 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1067
1068 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001069 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001070 or '-' then the shift sequence will be terminated implicitly and we
1071 don't have to insert a '-'. */
1072
1073 if (bitsleft == 0) {
1074 if (i + 1 < size) {
1075 Py_UNICODE ch2 = s[i+1];
1076
1077 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001078
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001079 } else if (B64CHAR(ch2) || ch2 == '-') {
1080 *out++ = '-';
1081 inShift = 0;
1082 } else {
1083 inShift = 0;
1084 }
1085
1086 }
1087 else {
1088 *out++ = '-';
1089 inShift = 0;
1090 }
1091 }
Tim Petersced69f82003-09-16 20:30:58 +00001092 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001094 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001095 if (bitsleft) {
1096 *out++= B64(charsleft << (6-bitsleft) );
1097 *out++ = '-';
1098 }
1099
Tim Peters5de98422002-04-27 18:44:32 +00001100 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001101 return v;
1102}
1103
/* The UTF-7 helper macros are not needed beyond this point. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111/* --- UTF-8 Codec -------------------------------------------------------- */
1112
/* Map UTF-8 encoded prefix byte to sequence length. zero means
   illegal prefix. see RFC 2279 for details.

   The table is indexed by the value of the first byte of a sequence.  It is
   only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six byte sequences; 0xfe and 0xff are
       illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1134
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135PyObject *PyUnicode_DecodeUTF8(const char *s,
1136 int size,
1137 const char *errors)
1138{
Walter Dörwald69652032004-09-07 20:24:22 +00001139 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1140}
1141
1142PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1143 int size,
1144 const char *errors,
1145 int *consumed)
1146{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001147 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001149 int startinpos;
1150 int endinpos;
1151 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001152 const char *e;
1153 PyUnicodeObject *unicode;
1154 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001155 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001156 PyObject *errorHandler = NULL;
1157 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158
1159 /* Note: size will always be longer than the resulting Unicode
1160 character count */
1161 unicode = _PyUnicode_New(size);
1162 if (!unicode)
1163 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001164 if (size == 0) {
1165 if (consumed)
1166 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169
1170 /* Unpack UTF-8 encoded data */
1171 p = unicode->str;
1172 e = s + size;
1173
1174 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001175 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176
1177 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001178 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 s++;
1180 continue;
1181 }
1182
1183 n = utf8_code_length[ch];
1184
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001185 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001186 if (consumed)
1187 break;
1188 else {
1189 errmsg = "unexpected end of data";
1190 startinpos = s-starts;
1191 endinpos = size;
1192 goto utf8Error;
1193 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195
1196 switch (n) {
1197
1198 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001199 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001200 startinpos = s-starts;
1201 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001202 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203
1204 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001205 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001206 startinpos = s-starts;
1207 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001208 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209
1210 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001211 if ((s[1] & 0xc0) != 0x80) {
1212 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001213 startinpos = s-starts;
1214 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001215 goto utf8Error;
1216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001218 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001219 startinpos = s-starts;
1220 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001221 errmsg = "illegal encoding";
1222 goto utf8Error;
1223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001225 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 break;
1227
1228 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001229 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001230 (s[2] & 0xc0) != 0x80) {
1231 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001232 startinpos = s-starts;
1233 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001234 goto utf8Error;
1235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001237 if (ch < 0x0800) {
1238 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001239 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001240
1241 XXX For wide builds (UCS-4) we should probably try
1242 to recombine the surrogates into a single code
1243 unit.
1244 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001245 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001246 startinpos = s-starts;
1247 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001248 goto utf8Error;
1249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001251 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001252 break;
1253
1254 case 4:
1255 if ((s[1] & 0xc0) != 0x80 ||
1256 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001257 (s[3] & 0xc0) != 0x80) {
1258 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001259 startinpos = s-starts;
1260 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 goto utf8Error;
1262 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001263 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1264 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1265 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001266 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001267 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001268 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001269 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001270 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001272 startinpos = s-starts;
1273 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001274 goto utf8Error;
1275 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001276#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001277 *p++ = (Py_UNICODE)ch;
1278#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001279 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001280
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001281 /* translate from 10000..10FFFF to 0..FFFF */
1282 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001283
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001284 /* high surrogate = top 10 bits added to D800 */
1285 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001286
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001287 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001288 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001289#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001290 break;
1291
1292 default:
1293 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001294 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001295 startinpos = s-starts;
1296 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 }
1299 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001300 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001301
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001302 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001303 outpos = p-PyUnicode_AS_UNICODE(unicode);
1304 if (unicode_decode_call_errorhandler(
1305 errors, &errorHandler,
1306 "utf8", errmsg,
1307 starts, size, &startinpos, &endinpos, &exc, &s,
1308 (PyObject **)&unicode, &outpos, &p))
1309 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 }
Walter Dörwald69652032004-09-07 20:24:22 +00001311 if (consumed)
1312 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001313
1314 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001315 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 goto onError;
1317
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001318 Py_XDECREF(errorHandler);
1319 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320 return (PyObject *)unicode;
1321
1322onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001323 Py_XDECREF(errorHandler);
1324 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325 Py_DECREF(unicode);
1326 return NULL;
1327}
1328
Tim Peters602f7402002-04-27 18:03:26 +00001329/* Allocation strategy: if the string is short, convert into a stack buffer
1330 and allocate exactly as much space needed at the end. Else allocate the
1331 maximum possible needed (4 result bytes per Unicode character), and return
1332 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001333*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001334PyObject *
1335PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1336 int size,
1337 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001338{
Tim Peters602f7402002-04-27 18:03:26 +00001339#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001340
Tim Peters602f7402002-04-27 18:03:26 +00001341 int i; /* index into s of next input byte */
1342 PyObject *v; /* result string object */
1343 char *p; /* next free byte in output buffer */
1344 int nallocated; /* number of result bytes allocated */
1345 int nneeded; /* number of result bytes needed */
1346 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001347
Tim Peters602f7402002-04-27 18:03:26 +00001348 assert(s != NULL);
1349 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350
Tim Peters602f7402002-04-27 18:03:26 +00001351 if (size <= MAX_SHORT_UNICHARS) {
1352 /* Write into the stack buffer; nallocated can't overflow.
1353 * At the end, we'll allocate exactly as much heap space as it
1354 * turns out we need.
1355 */
1356 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1357 v = NULL; /* will allocate after we're done */
1358 p = stackbuf;
1359 }
1360 else {
1361 /* Overallocate on the heap, and give the excess back at the end. */
1362 nallocated = size * 4;
1363 if (nallocated / 4 != size) /* overflow! */
1364 return PyErr_NoMemory();
1365 v = PyString_FromStringAndSize(NULL, nallocated);
1366 if (v == NULL)
1367 return NULL;
1368 p = PyString_AS_STRING(v);
1369 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001370
Tim Peters602f7402002-04-27 18:03:26 +00001371 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001372 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001373
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001374 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001375 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001377
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001379 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001380 *p++ = (char)(0xc0 | (ch >> 6));
1381 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001382 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001383 else {
Tim Peters602f7402002-04-27 18:03:26 +00001384 /* Encode UCS2 Unicode ordinals */
1385 if (ch < 0x10000) {
1386 /* Special case: check for high surrogate */
1387 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1388 Py_UCS4 ch2 = s[i];
1389 /* Check for low surrogate and combine the two to
1390 form a UCS4 value */
1391 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001392 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001393 i++;
1394 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001395 }
Tim Peters602f7402002-04-27 18:03:26 +00001396 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001397 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001398 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001399 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1400 *p++ = (char)(0x80 | (ch & 0x3f));
1401 continue;
1402 }
1403encodeUCS4:
1404 /* Encode UCS4 Unicode ordinals */
1405 *p++ = (char)(0xf0 | (ch >> 18));
1406 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1407 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1408 *p++ = (char)(0x80 | (ch & 0x3f));
1409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001411
Tim Peters602f7402002-04-27 18:03:26 +00001412 if (v == NULL) {
1413 /* This was stack allocated. */
1414 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1415 assert(nneeded <= nallocated);
1416 v = PyString_FromStringAndSize(stackbuf, nneeded);
1417 }
1418 else {
1419 /* Cut back to size actually needed. */
1420 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1421 assert(nneeded <= nallocated);
1422 _PyString_Resize(&v, nneeded);
1423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001425
Tim Peters602f7402002-04-27 18:03:26 +00001426#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001427}
1428
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1430{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431 if (!PyUnicode_Check(unicode)) {
1432 PyErr_BadArgument();
1433 return NULL;
1434 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001435 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1436 PyUnicode_GET_SIZE(unicode),
1437 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438}
1439
1440/* --- UTF-16 Codec ------------------------------------------------------- */
1441
Tim Peters772747b2001-08-09 22:21:55 +00001442PyObject *
1443PyUnicode_DecodeUTF16(const char *s,
1444 int size,
1445 const char *errors,
1446 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447{
Walter Dörwald69652032004-09-07 20:24:22 +00001448 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1449}
1450
1451PyObject *
1452PyUnicode_DecodeUTF16Stateful(const char *s,
1453 int size,
1454 const char *errors,
1455 int *byteorder,
1456 int *consumed)
1457{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001458 const char *starts = s;
1459 int startinpos;
1460 int endinpos;
1461 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462 PyUnicodeObject *unicode;
1463 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001464 const unsigned char *q, *e;
1465 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001466 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001467 /* Offsets from q for retrieving byte pairs in the right order. */
1468#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1469 int ihi = 1, ilo = 0;
1470#else
1471 int ihi = 0, ilo = 1;
1472#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001473 PyObject *errorHandler = NULL;
1474 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001475
1476 /* Note: size will always be longer than the resulting Unicode
1477 character count */
1478 unicode = _PyUnicode_New(size);
1479 if (!unicode)
1480 return NULL;
1481 if (size == 0)
1482 return (PyObject *)unicode;
1483
1484 /* Unpack UTF-16 encoded data */
1485 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001486 q = (unsigned char *)s;
1487 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001488
1489 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001490 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001492 /* Check for BOM marks (U+FEFF) in the input and adjust current
1493 byte order setting accordingly. In native mode, the leading BOM
1494 mark is skipped, in all other modes, it is copied to the output
1495 stream as-is (giving a ZWNBSP character). */
1496 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001497 if (size >= 2) {
1498 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001499#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001500 if (bom == 0xFEFF) {
1501 q += 2;
1502 bo = -1;
1503 }
1504 else if (bom == 0xFFFE) {
1505 q += 2;
1506 bo = 1;
1507 }
Tim Petersced69f82003-09-16 20:30:58 +00001508#else
Walter Dörwald69652032004-09-07 20:24:22 +00001509 if (bom == 0xFEFF) {
1510 q += 2;
1511 bo = 1;
1512 }
1513 else if (bom == 0xFFFE) {
1514 q += 2;
1515 bo = -1;
1516 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001517#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001518 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001520
Tim Peters772747b2001-08-09 22:21:55 +00001521 if (bo == -1) {
1522 /* force LE */
1523 ihi = 1;
1524 ilo = 0;
1525 }
1526 else if (bo == 1) {
1527 /* force BE */
1528 ihi = 0;
1529 ilo = 1;
1530 }
1531
1532 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001533 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001534 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001535 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001536 if (consumed)
1537 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001538 errmsg = "truncated data";
1539 startinpos = ((const char *)q)-starts;
1540 endinpos = ((const char *)e)-starts;
1541 goto utf16Error;
1542 /* The remaining input chars are ignored if the callback
1543 chooses to skip the input */
1544 }
1545 ch = (q[ihi] << 8) | q[ilo];
1546
Tim Peters772747b2001-08-09 22:21:55 +00001547 q += 2;
1548
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 if (ch < 0xD800 || ch > 0xDFFF) {
1550 *p++ = ch;
1551 continue;
1552 }
1553
1554 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001555 if (q >= e) {
1556 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001557 startinpos = (((const char *)q)-2)-starts;
1558 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001559 goto utf16Error;
1560 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001561 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001562 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1563 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001564 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001565#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001566 *p++ = ch;
1567 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001568#else
1569 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001570#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001571 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001572 }
1573 else {
1574 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001575 startinpos = (((const char *)q)-4)-starts;
1576 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001577 goto utf16Error;
1578 }
1579
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001581 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001582 startinpos = (((const char *)q)-2)-starts;
1583 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001584 /* Fall through to report the error */
1585
1586 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001587 outpos = p-PyUnicode_AS_UNICODE(unicode);
1588 if (unicode_decode_call_errorhandler(
1589 errors, &errorHandler,
1590 "utf16", errmsg,
1591 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1592 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001593 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 }
1595
1596 if (byteorder)
1597 *byteorder = bo;
1598
Walter Dörwald69652032004-09-07 20:24:22 +00001599 if (consumed)
1600 *consumed = (const char *)q-starts;
1601
Guido van Rossumd57fd912000-03-10 22:53:23 +00001602 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001603 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001604 goto onError;
1605
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001606 Py_XDECREF(errorHandler);
1607 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 return (PyObject *)unicode;
1609
1610onError:
1611 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001612 Py_XDECREF(errorHandler);
1613 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 return NULL;
1615}
1616
Tim Peters772747b2001-08-09 22:21:55 +00001617PyObject *
1618PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1619 int size,
1620 const char *errors,
1621 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622{
1623 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001624 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001625#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001626 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001627#else
1628 const int pairs = 0;
1629#endif
Tim Peters772747b2001-08-09 22:21:55 +00001630 /* Offsets from p for storing byte pairs in the right order. */
1631#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1632 int ihi = 1, ilo = 0;
1633#else
1634 int ihi = 0, ilo = 1;
1635#endif
1636
1637#define STORECHAR(CH) \
1638 do { \
1639 p[ihi] = ((CH) >> 8) & 0xff; \
1640 p[ilo] = (CH) & 0xff; \
1641 p += 2; \
1642 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001644#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001645 for (i = pairs = 0; i < size; i++)
1646 if (s[i] >= 0x10000)
1647 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001648#endif
Tim Petersced69f82003-09-16 20:30:58 +00001649 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001650 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651 if (v == NULL)
1652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653
Tim Peters772747b2001-08-09 22:21:55 +00001654 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001655 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001656 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001657 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001658 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001659
1660 if (byteorder == -1) {
1661 /* force LE */
1662 ihi = 1;
1663 ilo = 0;
1664 }
1665 else if (byteorder == 1) {
1666 /* force BE */
1667 ihi = 0;
1668 ilo = 1;
1669 }
1670
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001671 while (size-- > 0) {
1672 Py_UNICODE ch = *s++;
1673 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001674#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001675 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001676 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1677 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001678 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001679#endif
Tim Peters772747b2001-08-09 22:21:55 +00001680 STORECHAR(ch);
1681 if (ch2)
1682 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001685#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686}
1687
1688PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1689{
1690 if (!PyUnicode_Check(unicode)) {
1691 PyErr_BadArgument();
1692 return NULL;
1693 }
1694 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1695 PyUnicode_GET_SIZE(unicode),
1696 NULL,
1697 0);
1698}
1699
/* --- Unicode Escape Codec ----------------------------------------------- */
1701
Fredrik Lundh06d12682001-01-24 07:59:11 +00001702static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001703
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1705 int size,
1706 const char *errors)
1707{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001708 const char *starts = s;
1709 int startinpos;
1710 int endinpos;
1711 int outpos;
1712 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001714 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001716 char* message;
1717 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001718 PyObject *errorHandler = NULL;
1719 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001720
Guido van Rossumd57fd912000-03-10 22:53:23 +00001721 /* Escaped strings will always be longer than the resulting
1722 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001723 length after conversion to the true value.
1724 (but if the error callback returns a long replacement string
1725 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726 v = _PyUnicode_New(size);
1727 if (v == NULL)
1728 goto onError;
1729 if (size == 0)
1730 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001731
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001732 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001734
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 while (s < end) {
1736 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001737 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001738 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739
1740 /* Non-escape characters are interpreted as Unicode ordinals */
1741 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001742 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743 continue;
1744 }
1745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 /* \ - Escapes */
1748 s++;
1749 switch (*s++) {
1750
1751 /* \x escapes */
1752 case '\n': break;
1753 case '\\': *p++ = '\\'; break;
1754 case '\'': *p++ = '\''; break;
1755 case '\"': *p++ = '\"'; break;
1756 case 'b': *p++ = '\b'; break;
1757 case 'f': *p++ = '\014'; break; /* FF */
1758 case 't': *p++ = '\t'; break;
1759 case 'n': *p++ = '\n'; break;
1760 case 'r': *p++ = '\r'; break;
1761 case 'v': *p++ = '\013'; break; /* VT */
1762 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1763
1764 /* \OOO (octal) escapes */
1765 case '0': case '1': case '2': case '3':
1766 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001767 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001769 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001771 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001773 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 break;
1775
Fredrik Lundhccc74732001-02-18 22:13:49 +00001776 /* hex escapes */
1777 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001779 digits = 2;
1780 message = "truncated \\xXX escape";
1781 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782
Fredrik Lundhccc74732001-02-18 22:13:49 +00001783 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001785 digits = 4;
1786 message = "truncated \\uXXXX escape";
1787 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788
Fredrik Lundhccc74732001-02-18 22:13:49 +00001789 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001790 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001791 digits = 8;
1792 message = "truncated \\UXXXXXXXX escape";
1793 hexescape:
1794 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 outpos = p-PyUnicode_AS_UNICODE(v);
1796 if (s+digits>end) {
1797 endinpos = size;
1798 if (unicode_decode_call_errorhandler(
1799 errors, &errorHandler,
1800 "unicodeescape", "end of string in escape sequence",
1801 starts, size, &startinpos, &endinpos, &exc, &s,
1802 (PyObject **)&v, &outpos, &p))
1803 goto onError;
1804 goto nextByte;
1805 }
1806 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001807 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001808 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 endinpos = (s+i+1)-starts;
1810 if (unicode_decode_call_errorhandler(
1811 errors, &errorHandler,
1812 "unicodeescape", message,
1813 starts, size, &startinpos, &endinpos, &exc, &s,
1814 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001815 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001817 }
1818 chr = (chr<<4) & ~0xF;
1819 if (c >= '0' && c <= '9')
1820 chr += c - '0';
1821 else if (c >= 'a' && c <= 'f')
1822 chr += 10 + c - 'a';
1823 else
1824 chr += 10 + c - 'A';
1825 }
1826 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001827 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001828 /* _decoding_error will have already written into the
1829 target buffer. */
1830 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001831 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001832 /* when we get here, chr is a 32-bit unicode character */
1833 if (chr <= 0xffff)
1834 /* UCS-2 character */
1835 *p++ = (Py_UNICODE) chr;
1836 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001837 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001838 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001839#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001840 *p++ = chr;
1841#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001842 chr -= 0x10000L;
1843 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001844 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001845#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001846 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001847 endinpos = s-starts;
1848 outpos = p-PyUnicode_AS_UNICODE(v);
1849 if (unicode_decode_call_errorhandler(
1850 errors, &errorHandler,
1851 "unicodeescape", "illegal Unicode character",
1852 starts, size, &startinpos, &endinpos, &exc, &s,
1853 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001854 goto onError;
1855 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 break;
1857
1858 /* \N{name} */
1859 case 'N':
1860 message = "malformed \\N character escape";
1861 if (ucnhash_CAPI == NULL) {
1862 /* load the unicode data module */
1863 PyObject *m, *v;
1864 m = PyImport_ImportModule("unicodedata");
1865 if (m == NULL)
1866 goto ucnhashError;
1867 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1868 Py_DECREF(m);
1869 if (v == NULL)
1870 goto ucnhashError;
1871 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1872 Py_DECREF(v);
1873 if (ucnhash_CAPI == NULL)
1874 goto ucnhashError;
1875 }
1876 if (*s == '{') {
1877 const char *start = s+1;
1878 /* look for the closing brace */
1879 while (*s != '}' && s < end)
1880 s++;
1881 if (s > start && s < end && *s == '}') {
1882 /* found a name. look it up in the unicode database */
1883 message = "unknown Unicode character name";
1884 s++;
1885 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1886 goto store;
1887 }
1888 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001889 endinpos = s-starts;
1890 outpos = p-PyUnicode_AS_UNICODE(v);
1891 if (unicode_decode_call_errorhandler(
1892 errors, &errorHandler,
1893 "unicodeescape", message,
1894 starts, size, &startinpos, &endinpos, &exc, &s,
1895 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001896 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001897 break;
1898
1899 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001900 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001901 message = "\\ at end of string";
1902 s--;
1903 endinpos = s-starts;
1904 outpos = p-PyUnicode_AS_UNICODE(v);
1905 if (unicode_decode_call_errorhandler(
1906 errors, &errorHandler,
1907 "unicodeescape", message,
1908 starts, size, &startinpos, &endinpos, &exc, &s,
1909 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001910 goto onError;
1911 }
1912 else {
1913 *p++ = '\\';
1914 *p++ = (unsigned char)s[-1];
1915 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001916 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001918 nextByte:
1919 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001921 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001922 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001923 Py_XDECREF(errorHandler);
1924 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001926
Fredrik Lundhccc74732001-02-18 22:13:49 +00001927ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001928 PyErr_SetString(
1929 PyExc_UnicodeError,
1930 "\\N escapes not supported (can't load unicodedata module)"
1931 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001932 Py_XDECREF(errorHandler);
1933 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001934 return NULL;
1935
Fredrik Lundhccc74732001-02-18 22:13:49 +00001936onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001938 Py_XDECREF(errorHandler);
1939 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940 return NULL;
1941}
1942
1943/* Return a Unicode-Escape string version of the Unicode object.
1944
1945 If quotes is true, the string is enclosed in u"" or u'' quotes as
1946 appropriate.
1947
1948*/
1949
Barry Warsaw51ac5802000-03-20 16:36:48 +00001950static const Py_UNICODE *findchar(const Py_UNICODE *s,
1951 int size,
1952 Py_UNICODE ch);
1953
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954static
1955PyObject *unicodeescape_string(const Py_UNICODE *s,
1956 int size,
1957 int quotes)
1958{
1959 PyObject *repr;
1960 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001962 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963
1964 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1965 if (repr == NULL)
1966 return NULL;
1967
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001968 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969
1970 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001972 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 !findchar(s, size, '"')) ? '"' : '\'';
1974 }
1975 while (size-- > 0) {
1976 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001977
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001979 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001980 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 *p++ = '\\';
1982 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001983 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001984 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001985
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001986#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001987 /* Map 21-bit characters to '\U00xxxxxx' */
1988 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001989 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001990
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001991 /* Resize the string if necessary */
1992 if (offset + 12 > PyString_GET_SIZE(repr)) {
1993 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001994 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001995 p = PyString_AS_STRING(repr) + offset;
1996 }
1997
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001998 *p++ = '\\';
1999 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002000 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2001 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2002 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2003 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2004 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2005 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2006 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002007 *p++ = hexdigit[ch & 0x0000000F];
2008 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002009 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002010#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002011 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2012 else if (ch >= 0xD800 && ch < 0xDC00) {
2013 Py_UNICODE ch2;
2014 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002015
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002016 ch2 = *s++;
2017 size--;
2018 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2019 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2020 *p++ = '\\';
2021 *p++ = 'U';
2022 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2023 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2024 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2025 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2026 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2027 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2028 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2029 *p++ = hexdigit[ucs & 0x0000000F];
2030 continue;
2031 }
2032 /* Fall through: isolated surrogates are copied as-is */
2033 s--;
2034 size++;
2035 }
2036
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002038 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 *p++ = '\\';
2040 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002041 *p++ = hexdigit[(ch >> 12) & 0x000F];
2042 *p++ = hexdigit[(ch >> 8) & 0x000F];
2043 *p++ = hexdigit[(ch >> 4) & 0x000F];
2044 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002046
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002047 /* Map special whitespace to '\t', \n', '\r' */
2048 else if (ch == '\t') {
2049 *p++ = '\\';
2050 *p++ = 't';
2051 }
2052 else if (ch == '\n') {
2053 *p++ = '\\';
2054 *p++ = 'n';
2055 }
2056 else if (ch == '\r') {
2057 *p++ = '\\';
2058 *p++ = 'r';
2059 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002060
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002061 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002062 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002064 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002065 *p++ = hexdigit[(ch >> 4) & 0x000F];
2066 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002067 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002068
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 /* Copy everything else as-is */
2070 else
2071 *p++ = (char) ch;
2072 }
2073 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002074 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075
2076 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002077 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 return repr;
2079}
2080
2081PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2082 int size)
2083{
2084 return unicodeescape_string(s, size, 0);
2085}
2086
2087PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2088{
2089 if (!PyUnicode_Check(unicode)) {
2090 PyErr_BadArgument();
2091 return NULL;
2092 }
2093 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2094 PyUnicode_GET_SIZE(unicode));
2095}
2096
/* --- Raw Unicode Escape Codec ------------------------------------------- */
2098
2099PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2100 int size,
2101 const char *errors)
2102{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002103 const char *starts = s;
2104 int startinpos;
2105 int endinpos;
2106 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002108 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109 const char *end;
2110 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002111 PyObject *errorHandler = NULL;
2112 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002113
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 /* Escaped strings will always be longer than the resulting
2115 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002116 length after conversion to the true value. (But decoding error
2117 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 v = _PyUnicode_New(size);
2119 if (v == NULL)
2120 goto onError;
2121 if (size == 0)
2122 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002123 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 end = s + size;
2125 while (s < end) {
2126 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002127 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002129 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130
2131 /* Non-escape characters are interpreted as Unicode ordinals */
2132 if (*s != '\\') {
2133 *p++ = (unsigned char)*s++;
2134 continue;
2135 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002136 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137
2138 /* \u-escapes are only interpreted iff the number of leading
2139 backslashes if odd */
2140 bs = s;
2141 for (;s < end;) {
2142 if (*s != '\\')
2143 break;
2144 *p++ = (unsigned char)*s++;
2145 }
2146 if (((s - bs) & 1) == 0 ||
2147 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002148 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002149 continue;
2150 }
2151 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002152 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 s++;
2154
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002155 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002156 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002157 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002158 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002160 endinpos = s-starts;
2161 if (unicode_decode_call_errorhandler(
2162 errors, &errorHandler,
2163 "rawunicodeescape", "truncated \\uXXXX",
2164 starts, size, &startinpos, &endinpos, &exc, &s,
2165 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002167 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 }
2169 x = (x<<4) & ~0xF;
2170 if (c >= '0' && c <= '9')
2171 x += c - '0';
2172 else if (c >= 'a' && c <= 'f')
2173 x += 10 + c - 'a';
2174 else
2175 x += 10 + c - 'A';
2176 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002177#ifndef Py_UNICODE_WIDE
2178 if (x > 0x10000) {
2179 if (unicode_decode_call_errorhandler(
2180 errors, &errorHandler,
2181 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2182 starts, size, &startinpos, &endinpos, &exc, &s,
2183 (PyObject **)&v, &outpos, &p))
2184 goto onError;
2185 }
2186#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002187 *p++ = x;
2188 nextByte:
2189 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002191 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002192 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002193 Py_XDECREF(errorHandler);
2194 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002196
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 onError:
2198 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002199 Py_XDECREF(errorHandler);
2200 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 return NULL;
2202}
2203
2204PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2205 int size)
2206{
2207 PyObject *repr;
2208 char *p;
2209 char *q;
2210
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002211 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002213#ifdef Py_UNICODE_WIDE
2214 repr = PyString_FromStringAndSize(NULL, 10 * size);
2215#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002217#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002218 if (repr == NULL)
2219 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002220 if (size == 0)
2221 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222
2223 p = q = PyString_AS_STRING(repr);
2224 while (size-- > 0) {
2225 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002226#ifdef Py_UNICODE_WIDE
2227 /* Map 32-bit characters to '\Uxxxxxxxx' */
2228 if (ch >= 0x10000) {
2229 *p++ = '\\';
2230 *p++ = 'U';
2231 *p++ = hexdigit[(ch >> 28) & 0xf];
2232 *p++ = hexdigit[(ch >> 24) & 0xf];
2233 *p++ = hexdigit[(ch >> 20) & 0xf];
2234 *p++ = hexdigit[(ch >> 16) & 0xf];
2235 *p++ = hexdigit[(ch >> 12) & 0xf];
2236 *p++ = hexdigit[(ch >> 8) & 0xf];
2237 *p++ = hexdigit[(ch >> 4) & 0xf];
2238 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002239 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002240 else
2241#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242 /* Map 16-bit characters to '\uxxxx' */
2243 if (ch >= 256) {
2244 *p++ = '\\';
2245 *p++ = 'u';
2246 *p++ = hexdigit[(ch >> 12) & 0xf];
2247 *p++ = hexdigit[(ch >> 8) & 0xf];
2248 *p++ = hexdigit[(ch >> 4) & 0xf];
2249 *p++ = hexdigit[ch & 15];
2250 }
2251 /* Copy everything else as-is */
2252 else
2253 *p++ = (char) ch;
2254 }
2255 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002256 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257 return repr;
2258}
2259
2260PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2261{
2262 if (!PyUnicode_Check(unicode)) {
2263 PyErr_BadArgument();
2264 return NULL;
2265 }
2266 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2267 PyUnicode_GET_SIZE(unicode));
2268}
2269
/* --- Latin-1 Codec ------------------------------------------------------ */
2271
2272PyObject *PyUnicode_DecodeLatin1(const char *s,
2273 int size,
2274 const char *errors)
2275{
2276 PyUnicodeObject *v;
2277 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002278
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002280 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002281 Py_UNICODE r = *(unsigned char*)s;
2282 return PyUnicode_FromUnicode(&r, 1);
2283 }
2284
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285 v = _PyUnicode_New(size);
2286 if (v == NULL)
2287 goto onError;
2288 if (size == 0)
2289 return (PyObject *)v;
2290 p = PyUnicode_AS_UNICODE(v);
2291 while (size-- > 0)
2292 *p++ = (unsigned char)*s++;
2293 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002294
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 onError:
2296 Py_XDECREF(v);
2297 return NULL;
2298}
2299
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002300/* create or adjust a UnicodeEncodeError */
2301static void make_encode_exception(PyObject **exceptionObject,
2302 const char *encoding,
2303 const Py_UNICODE *unicode, int size,
2304 int startpos, int endpos,
2305 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002307 if (*exceptionObject == NULL) {
2308 *exceptionObject = PyUnicodeEncodeError_Create(
2309 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310 }
2311 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2313 goto onError;
2314 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2315 goto onError;
2316 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2317 goto onError;
2318 return;
2319 onError:
2320 Py_DECREF(*exceptionObject);
2321 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 }
2323}
2324
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002325/* raises a UnicodeEncodeError */
2326static void raise_encode_exception(PyObject **exceptionObject,
2327 const char *encoding,
2328 const Py_UNICODE *unicode, int size,
2329 int startpos, int endpos,
2330 const char *reason)
2331{
2332 make_encode_exception(exceptionObject,
2333 encoding, unicode, size, startpos, endpos, reason);
2334 if (*exceptionObject != NULL)
2335 PyCodec_StrictErrors(*exceptionObject);
2336}
2337
2338/* error handling callback helper:
2339 build arguments, call the callback and check the arguments,
2340 put the result into newpos and return the replacement string, which
2341 has to be freed by the caller */
2342static PyObject *unicode_encode_call_errorhandler(const char *errors,
2343 PyObject **errorHandler,
2344 const char *encoding, const char *reason,
2345 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2346 int startpos, int endpos,
2347 int *newpos)
2348{
2349 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2350
2351 PyObject *restuple;
2352 PyObject *resunicode;
2353
2354 if (*errorHandler == NULL) {
2355 *errorHandler = PyCodec_LookupError(errors);
2356 if (*errorHandler == NULL)
2357 return NULL;
2358 }
2359
2360 make_encode_exception(exceptionObject,
2361 encoding, unicode, size, startpos, endpos, reason);
2362 if (*exceptionObject == NULL)
2363 return NULL;
2364
2365 restuple = PyObject_CallFunctionObjArgs(
2366 *errorHandler, *exceptionObject, NULL);
2367 if (restuple == NULL)
2368 return NULL;
2369 if (!PyTuple_Check(restuple)) {
2370 PyErr_Format(PyExc_TypeError, &argparse[4]);
2371 Py_DECREF(restuple);
2372 return NULL;
2373 }
2374 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2375 &resunicode, newpos)) {
2376 Py_DECREF(restuple);
2377 return NULL;
2378 }
2379 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002380 *newpos = size+*newpos;
2381 if (*newpos<0 || *newpos>size) {
2382 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2383 Py_DECREF(restuple);
2384 return NULL;
2385 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002386 Py_INCREF(resunicode);
2387 Py_DECREF(restuple);
2388 return resunicode;
2389}
2390
2391static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2392 int size,
2393 const char *errors,
2394 int limit)
2395{
2396 /* output object */
2397 PyObject *res;
2398 /* pointers to the beginning and end+1 of input */
2399 const Py_UNICODE *startp = p;
2400 const Py_UNICODE *endp = p + size;
2401 /* pointer to the beginning of the unencodable characters */
2402 /* const Py_UNICODE *badp = NULL; */
2403 /* pointer into the output */
2404 char *str;
2405 /* current output position */
2406 int respos = 0;
2407 int ressize;
2408 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2409 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2410 PyObject *errorHandler = NULL;
2411 PyObject *exc = NULL;
2412 /* the following variable is used for caching string comparisons
2413 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2414 int known_errorHandler = -1;
2415
2416 /* allocate enough for a simple encoding without
2417 replacements, if we need more, we'll resize */
2418 res = PyString_FromStringAndSize(NULL, size);
2419 if (res == NULL)
2420 goto onError;
2421 if (size == 0)
2422 return res;
2423 str = PyString_AS_STRING(res);
2424 ressize = size;
2425
2426 while (p<endp) {
2427 Py_UNICODE c = *p;
2428
2429 /* can we encode this? */
2430 if (c<limit) {
2431 /* no overflow check, because we know that the space is enough */
2432 *str++ = (char)c;
2433 ++p;
2434 }
2435 else {
2436 int unicodepos = p-startp;
2437 int requiredsize;
2438 PyObject *repunicode;
2439 int repsize;
2440 int newpos;
2441 int respos;
2442 Py_UNICODE *uni2;
2443 /* startpos for collecting unencodable chars */
2444 const Py_UNICODE *collstart = p;
2445 const Py_UNICODE *collend = p;
2446 /* find all unecodable characters */
2447 while ((collend < endp) && ((*collend)>=limit))
2448 ++collend;
2449 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2450 if (known_errorHandler==-1) {
2451 if ((errors==NULL) || (!strcmp(errors, "strict")))
2452 known_errorHandler = 1;
2453 else if (!strcmp(errors, "replace"))
2454 known_errorHandler = 2;
2455 else if (!strcmp(errors, "ignore"))
2456 known_errorHandler = 3;
2457 else if (!strcmp(errors, "xmlcharrefreplace"))
2458 known_errorHandler = 4;
2459 else
2460 known_errorHandler = 0;
2461 }
2462 switch (known_errorHandler) {
2463 case 1: /* strict */
2464 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2465 goto onError;
2466 case 2: /* replace */
2467 while (collstart++<collend)
2468 *str++ = '?'; /* fall through */
2469 case 3: /* ignore */
2470 p = collend;
2471 break;
2472 case 4: /* xmlcharrefreplace */
2473 respos = str-PyString_AS_STRING(res);
2474 /* determine replacement size (temporarily (mis)uses p) */
2475 for (p = collstart, repsize = 0; p < collend; ++p) {
2476 if (*p<10)
2477 repsize += 2+1+1;
2478 else if (*p<100)
2479 repsize += 2+2+1;
2480 else if (*p<1000)
2481 repsize += 2+3+1;
2482 else if (*p<10000)
2483 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002484#ifndef Py_UNICODE_WIDE
2485 else
2486 repsize += 2+5+1;
2487#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002488 else if (*p<100000)
2489 repsize += 2+5+1;
2490 else if (*p<1000000)
2491 repsize += 2+6+1;
2492 else
2493 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002494#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002495 }
2496 requiredsize = respos+repsize+(endp-collend);
2497 if (requiredsize > ressize) {
2498 if (requiredsize<2*ressize)
2499 requiredsize = 2*ressize;
2500 if (_PyString_Resize(&res, requiredsize))
2501 goto onError;
2502 str = PyString_AS_STRING(res) + respos;
2503 ressize = requiredsize;
2504 }
2505 /* generate replacement (temporarily (mis)uses p) */
2506 for (p = collstart; p < collend; ++p) {
2507 str += sprintf(str, "&#%d;", (int)*p);
2508 }
2509 p = collend;
2510 break;
2511 default:
2512 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2513 encoding, reason, startp, size, &exc,
2514 collstart-startp, collend-startp, &newpos);
2515 if (repunicode == NULL)
2516 goto onError;
2517 /* need more space? (at least enough for what we
2518 have+the replacement+the rest of the string, so
2519 we won't have to check space for encodable characters) */
2520 respos = str-PyString_AS_STRING(res);
2521 repsize = PyUnicode_GET_SIZE(repunicode);
2522 requiredsize = respos+repsize+(endp-collend);
2523 if (requiredsize > ressize) {
2524 if (requiredsize<2*ressize)
2525 requiredsize = 2*ressize;
2526 if (_PyString_Resize(&res, requiredsize)) {
2527 Py_DECREF(repunicode);
2528 goto onError;
2529 }
2530 str = PyString_AS_STRING(res) + respos;
2531 ressize = requiredsize;
2532 }
2533 /* check if there is anything unencodable in the replacement
2534 and copy it to the output */
2535 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2536 c = *uni2;
2537 if (c >= limit) {
2538 raise_encode_exception(&exc, encoding, startp, size,
2539 unicodepos, unicodepos+1, reason);
2540 Py_DECREF(repunicode);
2541 goto onError;
2542 }
2543 *str = (char)c;
2544 }
2545 p = startp + newpos;
2546 Py_DECREF(repunicode);
2547 }
2548 }
2549 }
2550 /* Resize if we allocated to much */
2551 respos = str-PyString_AS_STRING(res);
2552 if (respos<ressize)
2553 /* If this falls res will be NULL */
2554 _PyString_Resize(&res, respos);
2555 Py_XDECREF(errorHandler);
2556 Py_XDECREF(exc);
2557 return res;
2558
2559 onError:
2560 Py_XDECREF(res);
2561 Py_XDECREF(errorHandler);
2562 Py_XDECREF(exc);
2563 return NULL;
2564}
2565
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2567 int size,
2568 const char *errors)
2569{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002570 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571}
2572
2573PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2574{
2575 if (!PyUnicode_Check(unicode)) {
2576 PyErr_BadArgument();
2577 return NULL;
2578 }
2579 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2580 PyUnicode_GET_SIZE(unicode),
2581 NULL);
2582}
2583
2584/* --- 7-bit ASCII Codec -------------------------------------------------- */
2585
Guido van Rossumd57fd912000-03-10 22:53:23 +00002586PyObject *PyUnicode_DecodeASCII(const char *s,
2587 int size,
2588 const char *errors)
2589{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002590 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591 PyUnicodeObject *v;
2592 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002593 int startinpos;
2594 int endinpos;
2595 int outpos;
2596 const char *e;
2597 PyObject *errorHandler = NULL;
2598 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002599
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002601 if (size == 1 && *(unsigned char*)s < 128) {
2602 Py_UNICODE r = *(unsigned char*)s;
2603 return PyUnicode_FromUnicode(&r, 1);
2604 }
Tim Petersced69f82003-09-16 20:30:58 +00002605
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606 v = _PyUnicode_New(size);
2607 if (v == NULL)
2608 goto onError;
2609 if (size == 0)
2610 return (PyObject *)v;
2611 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002612 e = s + size;
2613 while (s < e) {
2614 register unsigned char c = (unsigned char)*s;
2615 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002617 ++s;
2618 }
2619 else {
2620 startinpos = s-starts;
2621 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002622 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002623 if (unicode_decode_call_errorhandler(
2624 errors, &errorHandler,
2625 "ascii", "ordinal not in range(128)",
2626 starts, size, &startinpos, &endinpos, &exc, &s,
2627 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002629 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002631 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002632 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002633 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002634 Py_XDECREF(errorHandler);
2635 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002637
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638 onError:
2639 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 Py_XDECREF(errorHandler);
2641 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 return NULL;
2643}
2644
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2646 int size,
2647 const char *errors)
2648{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002649 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650}
2651
2652PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2653{
2654 if (!PyUnicode_Check(unicode)) {
2655 PyErr_BadArgument();
2656 return NULL;
2657 }
2658 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2659 PyUnicode_GET_SIZE(unicode),
2660 NULL);
2661}
2662
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002663#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002664
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002665/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002666
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002667PyObject *PyUnicode_DecodeMBCS(const char *s,
2668 int size,
2669 const char *errors)
2670{
2671 PyUnicodeObject *v;
2672 Py_UNICODE *p;
2673
2674 /* First get the size of the result */
2675 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002676 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002677 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2678
2679 v = _PyUnicode_New(usize);
2680 if (v == NULL)
2681 return NULL;
2682 if (usize == 0)
2683 return (PyObject *)v;
2684 p = PyUnicode_AS_UNICODE(v);
2685 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2686 Py_DECREF(v);
2687 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2688 }
2689
2690 return (PyObject *)v;
2691}
2692
2693PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2694 int size,
2695 const char *errors)
2696{
2697 PyObject *repr;
2698 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002699 DWORD mbcssize;
2700
2701 /* If there are no characters, bail now! */
2702 if (size==0)
2703 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002704
2705 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002706 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002707 if (mbcssize==0)
2708 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2709
2710 repr = PyString_FromStringAndSize(NULL, mbcssize);
2711 if (repr == NULL)
2712 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002713 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002714 return repr;
2715
2716 /* Do the conversion */
2717 s = PyString_AS_STRING(repr);
2718 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2719 Py_DECREF(repr);
2720 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2721 }
2722 return repr;
2723}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002724
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002725PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2726{
2727 if (!PyUnicode_Check(unicode)) {
2728 PyErr_BadArgument();
2729 return NULL;
2730 }
2731 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2732 PyUnicode_GET_SIZE(unicode),
2733 NULL);
2734}
2735
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002736#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002737
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738/* --- Character Mapping Codec -------------------------------------------- */
2739
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740PyObject *PyUnicode_DecodeCharmap(const char *s,
2741 int size,
2742 PyObject *mapping,
2743 const char *errors)
2744{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002745 const char *starts = s;
2746 int startinpos;
2747 int endinpos;
2748 int outpos;
2749 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 PyUnicodeObject *v;
2751 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002752 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 PyObject *errorHandler = NULL;
2754 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002755
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 /* Default to Latin-1 */
2757 if (mapping == NULL)
2758 return PyUnicode_DecodeLatin1(s, size, errors);
2759
2760 v = _PyUnicode_New(size);
2761 if (v == NULL)
2762 goto onError;
2763 if (size == 0)
2764 return (PyObject *)v;
2765 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 e = s + size;
2767 while (s < e) {
2768 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769 PyObject *w, *x;
2770
2771 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2772 w = PyInt_FromLong((long)ch);
2773 if (w == NULL)
2774 goto onError;
2775 x = PyObject_GetItem(mapping, w);
2776 Py_DECREF(w);
2777 if (x == NULL) {
2778 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002779 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002781 x = Py_None;
2782 Py_INCREF(x);
2783 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002784 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 }
2786
2787 /* Apply mapping */
2788 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002789 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 if (value < 0 || value > 65535) {
2791 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002792 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793 Py_DECREF(x);
2794 goto onError;
2795 }
2796 *p++ = (Py_UNICODE)value;
2797 }
2798 else if (x == Py_None) {
2799 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 outpos = p-PyUnicode_AS_UNICODE(v);
2801 startinpos = s-starts;
2802 endinpos = startinpos+1;
2803 if (unicode_decode_call_errorhandler(
2804 errors, &errorHandler,
2805 "charmap", "character maps to <undefined>",
2806 starts, size, &startinpos, &endinpos, &exc, &s,
2807 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 Py_DECREF(x);
2809 goto onError;
2810 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812 }
2813 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002814 int targetsize = PyUnicode_GET_SIZE(x);
2815
2816 if (targetsize == 1)
2817 /* 1-1 mapping */
2818 *p++ = *PyUnicode_AS_UNICODE(x);
2819
2820 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002822 if (targetsize > extrachars) {
2823 /* resize first */
2824 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2825 int needed = (targetsize - extrachars) + \
2826 (targetsize << 2);
2827 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002828 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002829 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002830 Py_DECREF(x);
2831 goto onError;
2832 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002833 p = PyUnicode_AS_UNICODE(v) + oldpos;
2834 }
2835 Py_UNICODE_COPY(p,
2836 PyUnicode_AS_UNICODE(x),
2837 targetsize);
2838 p += targetsize;
2839 extrachars -= targetsize;
2840 }
2841 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002842 }
2843 else {
2844 /* wrong return value */
2845 PyErr_SetString(PyExc_TypeError,
2846 "character mapping must return integer, None or unicode");
2847 Py_DECREF(x);
2848 goto onError;
2849 }
2850 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002851 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 }
2853 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002854 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002856 Py_XDECREF(errorHandler);
2857 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002859
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 Py_XDECREF(errorHandler);
2862 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863 Py_XDECREF(v);
2864 return NULL;
2865}
2866
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867/* Lookup the character ch in the mapping. If the character
2868 can't be found, Py_None is returned (or NULL, if another
2869 error occured). */
2870static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002872 PyObject *w = PyInt_FromLong((long)c);
2873 PyObject *x;
2874
2875 if (w == NULL)
2876 return NULL;
2877 x = PyObject_GetItem(mapping, w);
2878 Py_DECREF(w);
2879 if (x == NULL) {
2880 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2881 /* No mapping found means: mapping is undefined. */
2882 PyErr_Clear();
2883 x = Py_None;
2884 Py_INCREF(x);
2885 return x;
2886 } else
2887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002889 else if (x == Py_None)
2890 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002891 else if (PyInt_Check(x)) {
2892 long value = PyInt_AS_LONG(x);
2893 if (value < 0 || value > 255) {
2894 PyErr_SetString(PyExc_TypeError,
2895 "character mapping must be in range(256)");
2896 Py_DECREF(x);
2897 return NULL;
2898 }
2899 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002901 else if (PyString_Check(x))
2902 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002904 /* wrong return value */
2905 PyErr_SetString(PyExc_TypeError,
2906 "character mapping must return integer, None or str");
2907 Py_DECREF(x);
2908 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 }
2910}
2911
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002912/* lookup the character, put the result in the output string and adjust
2913 various state variables. Reallocate the output string if not enough
2914 space is available. Return a new reference to the object that
2915 was put in the output buffer, or Py_None, if the mapping was undefined
2916 (in which case no character was written) or NULL, if a
2917 reallocation error ocurred. The called must decref the result */
2918static
2919PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2920 PyObject **outobj, int *outpos)
2921{
2922 PyObject *rep = charmapencode_lookup(c, mapping);
2923
2924 if (rep==NULL)
2925 return NULL;
2926 else if (rep==Py_None)
2927 return rep;
2928 else {
2929 char *outstart = PyString_AS_STRING(*outobj);
2930 int outsize = PyString_GET_SIZE(*outobj);
2931 if (PyInt_Check(rep)) {
2932 int requiredsize = *outpos+1;
2933 if (outsize<requiredsize) {
2934 /* exponentially overallocate to minimize reallocations */
2935 if (requiredsize < 2*outsize)
2936 requiredsize = 2*outsize;
2937 if (_PyString_Resize(outobj, requiredsize)) {
2938 Py_DECREF(rep);
2939 return NULL;
2940 }
2941 outstart = PyString_AS_STRING(*outobj);
2942 }
2943 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2944 }
2945 else {
2946 const char *repchars = PyString_AS_STRING(rep);
2947 int repsize = PyString_GET_SIZE(rep);
2948 int requiredsize = *outpos+repsize;
2949 if (outsize<requiredsize) {
2950 /* exponentially overallocate to minimize reallocations */
2951 if (requiredsize < 2*outsize)
2952 requiredsize = 2*outsize;
2953 if (_PyString_Resize(outobj, requiredsize)) {
2954 Py_DECREF(rep);
2955 return NULL;
2956 }
2957 outstart = PyString_AS_STRING(*outobj);
2958 }
2959 memcpy(outstart + *outpos, repchars, repsize);
2960 *outpos += repsize;
2961 }
2962 }
2963 return rep;
2964}
2965
2966/* handle an error in PyUnicode_EncodeCharmap
2967 Return 0 on success, -1 on error */
2968static
2969int charmap_encoding_error(
2970 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2971 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002972 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002973 PyObject **res, int *respos)
2974{
2975 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2976 int repsize;
2977 int newpos;
2978 Py_UNICODE *uni2;
2979 /* startpos for collecting unencodable chars */
2980 int collstartpos = *inpos;
2981 int collendpos = *inpos+1;
2982 int collpos;
2983 char *encoding = "charmap";
2984 char *reason = "character maps to <undefined>";
2985
2986 PyObject *x;
2987 /* find all unencodable characters */
2988 while (collendpos < size) {
2989 x = charmapencode_lookup(p[collendpos], mapping);
2990 if (x==NULL)
2991 return -1;
2992 else if (x!=Py_None) {
2993 Py_DECREF(x);
2994 break;
2995 }
2996 Py_DECREF(x);
2997 ++collendpos;
2998 }
2999 /* cache callback name lookup
3000 * (if not done yet, i.e. it's the first error) */
3001 if (*known_errorHandler==-1) {
3002 if ((errors==NULL) || (!strcmp(errors, "strict")))
3003 *known_errorHandler = 1;
3004 else if (!strcmp(errors, "replace"))
3005 *known_errorHandler = 2;
3006 else if (!strcmp(errors, "ignore"))
3007 *known_errorHandler = 3;
3008 else if (!strcmp(errors, "xmlcharrefreplace"))
3009 *known_errorHandler = 4;
3010 else
3011 *known_errorHandler = 0;
3012 }
3013 switch (*known_errorHandler) {
3014 case 1: /* strict */
3015 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3016 return -1;
3017 case 2: /* replace */
3018 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3019 x = charmapencode_output('?', mapping, res, respos);
3020 if (x==NULL) {
3021 return -1;
3022 }
3023 else if (x==Py_None) {
3024 Py_DECREF(x);
3025 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3026 return -1;
3027 }
3028 Py_DECREF(x);
3029 }
3030 /* fall through */
3031 case 3: /* ignore */
3032 *inpos = collendpos;
3033 break;
3034 case 4: /* xmlcharrefreplace */
3035 /* generate replacement (temporarily (mis)uses p) */
3036 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3037 char buffer[2+29+1+1];
3038 char *cp;
3039 sprintf(buffer, "&#%d;", (int)p[collpos]);
3040 for (cp = buffer; *cp; ++cp) {
3041 x = charmapencode_output(*cp, mapping, res, respos);
3042 if (x==NULL)
3043 return -1;
3044 else if (x==Py_None) {
3045 Py_DECREF(x);
3046 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3047 return -1;
3048 }
3049 Py_DECREF(x);
3050 }
3051 }
3052 *inpos = collendpos;
3053 break;
3054 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003055 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003056 encoding, reason, p, size, exceptionObject,
3057 collstartpos, collendpos, &newpos);
3058 if (repunicode == NULL)
3059 return -1;
3060 /* generate replacement */
3061 repsize = PyUnicode_GET_SIZE(repunicode);
3062 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3063 x = charmapencode_output(*uni2, mapping, res, respos);
3064 if (x==NULL) {
3065 Py_DECREF(repunicode);
3066 return -1;
3067 }
3068 else if (x==Py_None) {
3069 Py_DECREF(repunicode);
3070 Py_DECREF(x);
3071 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3072 return -1;
3073 }
3074 Py_DECREF(x);
3075 }
3076 *inpos = newpos;
3077 Py_DECREF(repunicode);
3078 }
3079 return 0;
3080}
3081
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3083 int size,
3084 PyObject *mapping,
3085 const char *errors)
3086{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003087 /* output object */
3088 PyObject *res = NULL;
3089 /* current input position */
3090 int inpos = 0;
3091 /* current output position */
3092 int respos = 0;
3093 PyObject *errorHandler = NULL;
3094 PyObject *exc = NULL;
3095 /* the following variable is used for caching string comparisons
3096 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3097 * 3=ignore, 4=xmlcharrefreplace */
3098 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099
3100 /* Default to Latin-1 */
3101 if (mapping == NULL)
3102 return PyUnicode_EncodeLatin1(p, size, errors);
3103
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003104 /* allocate enough for a simple encoding without
3105 replacements, if we need more, we'll resize */
3106 res = PyString_FromStringAndSize(NULL, size);
3107 if (res == NULL)
3108 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003109 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003110 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003112 while (inpos<size) {
3113 /* try to encode it */
3114 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3115 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003117 if (x==Py_None) { /* unencodable character */
3118 if (charmap_encoding_error(p, size, &inpos, mapping,
3119 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003120 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003121 &res, &respos)) {
3122 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003123 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003126 else
3127 /* done with this character => adjust input position */
3128 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 Py_DECREF(x);
3130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003132 /* Resize if we allocated to much */
3133 if (respos<PyString_GET_SIZE(res)) {
3134 if (_PyString_Resize(&res, respos))
3135 goto onError;
3136 }
3137 Py_XDECREF(exc);
3138 Py_XDECREF(errorHandler);
3139 return res;
3140
3141 onError:
3142 Py_XDECREF(res);
3143 Py_XDECREF(exc);
3144 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145 return NULL;
3146}
3147
3148PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3149 PyObject *mapping)
3150{
3151 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3152 PyErr_BadArgument();
3153 return NULL;
3154 }
3155 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3156 PyUnicode_GET_SIZE(unicode),
3157 mapping,
3158 NULL);
3159}
3160
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003161/* create or adjust a UnicodeTranslateError */
3162static void make_translate_exception(PyObject **exceptionObject,
3163 const Py_UNICODE *unicode, int size,
3164 int startpos, int endpos,
3165 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003167 if (*exceptionObject == NULL) {
3168 *exceptionObject = PyUnicodeTranslateError_Create(
3169 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 }
3171 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003172 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3173 goto onError;
3174 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3175 goto onError;
3176 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3177 goto onError;
3178 return;
3179 onError:
3180 Py_DECREF(*exceptionObject);
3181 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182 }
3183}
3184
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003185/* raises a UnicodeTranslateError */
3186static void raise_translate_exception(PyObject **exceptionObject,
3187 const Py_UNICODE *unicode, int size,
3188 int startpos, int endpos,
3189 const char *reason)
3190{
3191 make_translate_exception(exceptionObject,
3192 unicode, size, startpos, endpos, reason);
3193 if (*exceptionObject != NULL)
3194 PyCodec_StrictErrors(*exceptionObject);
3195}
3196
3197/* error handling callback helper:
3198 build arguments, call the callback and check the arguments,
3199 put the result into newpos and return the replacement string, which
3200 has to be freed by the caller */
3201static PyObject *unicode_translate_call_errorhandler(const char *errors,
3202 PyObject **errorHandler,
3203 const char *reason,
3204 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3205 int startpos, int endpos,
3206 int *newpos)
3207{
3208 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3209
3210 PyObject *restuple;
3211 PyObject *resunicode;
3212
3213 if (*errorHandler == NULL) {
3214 *errorHandler = PyCodec_LookupError(errors);
3215 if (*errorHandler == NULL)
3216 return NULL;
3217 }
3218
3219 make_translate_exception(exceptionObject,
3220 unicode, size, startpos, endpos, reason);
3221 if (*exceptionObject == NULL)
3222 return NULL;
3223
3224 restuple = PyObject_CallFunctionObjArgs(
3225 *errorHandler, *exceptionObject, NULL);
3226 if (restuple == NULL)
3227 return NULL;
3228 if (!PyTuple_Check(restuple)) {
3229 PyErr_Format(PyExc_TypeError, &argparse[4]);
3230 Py_DECREF(restuple);
3231 return NULL;
3232 }
3233 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3234 &resunicode, newpos)) {
3235 Py_DECREF(restuple);
3236 return NULL;
3237 }
3238 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003239 *newpos = size+*newpos;
3240 if (*newpos<0 || *newpos>size) {
3241 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3242 Py_DECREF(restuple);
3243 return NULL;
3244 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003245 Py_INCREF(resunicode);
3246 Py_DECREF(restuple);
3247 return resunicode;
3248}
3249
3250/* Lookup the character ch in the mapping and put the result in result,
3251 which must be decrefed by the caller.
3252 Return 0 on success, -1 on error */
3253static
3254int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3255{
3256 PyObject *w = PyInt_FromLong((long)c);
3257 PyObject *x;
3258
3259 if (w == NULL)
3260 return -1;
3261 x = PyObject_GetItem(mapping, w);
3262 Py_DECREF(w);
3263 if (x == NULL) {
3264 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3265 /* No mapping found means: use 1:1 mapping. */
3266 PyErr_Clear();
3267 *result = NULL;
3268 return 0;
3269 } else
3270 return -1;
3271 }
3272 else if (x == Py_None) {
3273 *result = x;
3274 return 0;
3275 }
3276 else if (PyInt_Check(x)) {
3277 long value = PyInt_AS_LONG(x);
3278 long max = PyUnicode_GetMax();
3279 if (value < 0 || value > max) {
3280 PyErr_Format(PyExc_TypeError,
3281 "character mapping must be in range(0x%lx)", max+1);
3282 Py_DECREF(x);
3283 return -1;
3284 }
3285 *result = x;
3286 return 0;
3287 }
3288 else if (PyUnicode_Check(x)) {
3289 *result = x;
3290 return 0;
3291 }
3292 else {
3293 /* wrong return value */
3294 PyErr_SetString(PyExc_TypeError,
3295 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003296 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 return -1;
3298 }
3299}
3300/* ensure that *outobj is at least requiredsize characters long,
3301if not reallocate and adjust various state variables.
3302Return 0 on success, -1 on error */
3303static
Walter Dörwald4894c302003-10-24 14:25:28 +00003304int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003305 int requiredsize)
3306{
Walter Dörwald4894c302003-10-24 14:25:28 +00003307 int oldsize = PyUnicode_GET_SIZE(*outobj);
3308 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003309 /* remember old output position */
3310 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3311 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003312 if (requiredsize < 2 * oldsize)
3313 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003314 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315 return -1;
3316 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317 }
3318 return 0;
3319}
3320/* lookup the character, put the result in the output string and adjust
3321 various state variables. Return a new reference to the object that
3322 was put in the output buffer in *result, or Py_None, if the mapping was
3323 undefined (in which case no character was written).
3324 The called must decref result.
3325 Return 0 on success, -1 on error. */
3326static
Walter Dörwald4894c302003-10-24 14:25:28 +00003327int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3328 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3329 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330{
Walter Dörwald4894c302003-10-24 14:25:28 +00003331 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003332 return -1;
3333 if (*res==NULL) {
3334 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003335 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336 }
3337 else if (*res==Py_None)
3338 ;
3339 else if (PyInt_Check(*res)) {
3340 /* no overflow check, because we know that the space is enough */
3341 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3342 }
3343 else if (PyUnicode_Check(*res)) {
3344 int repsize = PyUnicode_GET_SIZE(*res);
3345 if (repsize==1) {
3346 /* no overflow check, because we know that the space is enough */
3347 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3348 }
3349 else if (repsize!=0) {
3350 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003351 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003352 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003353 repsize - 1;
3354 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355 return -1;
3356 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3357 *outp += repsize;
3358 }
3359 }
3360 else
3361 return -1;
3362 return 0;
3363}
3364
3365PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 int size,
3367 PyObject *mapping,
3368 const char *errors)
3369{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003370 /* output object */
3371 PyObject *res = NULL;
3372 /* pointers to the beginning and end+1 of input */
3373 const Py_UNICODE *startp = p;
3374 const Py_UNICODE *endp = p + size;
3375 /* pointer into the output */
3376 Py_UNICODE *str;
3377 /* current output position */
3378 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003379 char *reason = "character maps to <undefined>";
3380 PyObject *errorHandler = NULL;
3381 PyObject *exc = NULL;
3382 /* the following variable is used for caching string comparisons
3383 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3384 * 3=ignore, 4=xmlcharrefreplace */
3385 int known_errorHandler = -1;
3386
Guido van Rossumd57fd912000-03-10 22:53:23 +00003387 if (mapping == NULL) {
3388 PyErr_BadArgument();
3389 return NULL;
3390 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003391
3392 /* allocate enough for a simple 1:1 translation without
3393 replacements, if we need more, we'll resize */
3394 res = PyUnicode_FromUnicode(NULL, size);
3395 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003396 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 return res;
3399 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401 while (p<endp) {
3402 /* try to encode it */
3403 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003404 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003405 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406 goto onError;
3407 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003408 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003409 if (x!=Py_None) /* it worked => adjust input pointer */
3410 ++p;
3411 else { /* untranslatable character */
3412 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3413 int repsize;
3414 int newpos;
3415 Py_UNICODE *uni2;
3416 /* startpos for collecting untranslatable chars */
3417 const Py_UNICODE *collstart = p;
3418 const Py_UNICODE *collend = p+1;
3419 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421 /* find all untranslatable characters */
3422 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003423 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003424 goto onError;
3425 Py_XDECREF(x);
3426 if (x!=Py_None)
3427 break;
3428 ++collend;
3429 }
3430 /* cache callback name lookup
3431 * (if not done yet, i.e. it's the first error) */
3432 if (known_errorHandler==-1) {
3433 if ((errors==NULL) || (!strcmp(errors, "strict")))
3434 known_errorHandler = 1;
3435 else if (!strcmp(errors, "replace"))
3436 known_errorHandler = 2;
3437 else if (!strcmp(errors, "ignore"))
3438 known_errorHandler = 3;
3439 else if (!strcmp(errors, "xmlcharrefreplace"))
3440 known_errorHandler = 4;
3441 else
3442 known_errorHandler = 0;
3443 }
3444 switch (known_errorHandler) {
3445 case 1: /* strict */
3446 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3447 goto onError;
3448 case 2: /* replace */
3449 /* No need to check for space, this is a 1:1 replacement */
3450 for (coll = collstart; coll<collend; ++coll)
3451 *str++ = '?';
3452 /* fall through */
3453 case 3: /* ignore */
3454 p = collend;
3455 break;
3456 case 4: /* xmlcharrefreplace */
3457 /* generate replacement (temporarily (mis)uses p) */
3458 for (p = collstart; p < collend; ++p) {
3459 char buffer[2+29+1+1];
3460 char *cp;
3461 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003462 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3464 goto onError;
3465 for (cp = buffer; *cp; ++cp)
3466 *str++ = *cp;
3467 }
3468 p = collend;
3469 break;
3470 default:
3471 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3472 reason, startp, size, &exc,
3473 collstart-startp, collend-startp, &newpos);
3474 if (repunicode == NULL)
3475 goto onError;
3476 /* generate replacement */
3477 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003478 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3480 Py_DECREF(repunicode);
3481 goto onError;
3482 }
3483 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3484 *str++ = *uni2;
3485 p = startp + newpos;
3486 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487 }
3488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003490 /* Resize if we allocated to much */
3491 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003492 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003493 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003494 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495 }
3496 Py_XDECREF(exc);
3497 Py_XDECREF(errorHandler);
3498 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500 onError:
3501 Py_XDECREF(res);
3502 Py_XDECREF(exc);
3503 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 return NULL;
3505}
3506
3507PyObject *PyUnicode_Translate(PyObject *str,
3508 PyObject *mapping,
3509 const char *errors)
3510{
3511 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003512
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 str = PyUnicode_FromObject(str);
3514 if (str == NULL)
3515 goto onError;
3516 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3517 PyUnicode_GET_SIZE(str),
3518 mapping,
3519 errors);
3520 Py_DECREF(str);
3521 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003522
Guido van Rossumd57fd912000-03-10 22:53:23 +00003523 onError:
3524 Py_XDECREF(str);
3525 return NULL;
3526}
Tim Petersced69f82003-09-16 20:30:58 +00003527
Guido van Rossum9e896b32000-04-05 20:11:21 +00003528/* --- Decimal Encoder ---------------------------------------------------- */
3529
3530int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3531 int length,
3532 char *output,
3533 const char *errors)
3534{
3535 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536 PyObject *errorHandler = NULL;
3537 PyObject *exc = NULL;
3538 const char *encoding = "decimal";
3539 const char *reason = "invalid decimal Unicode string";
3540 /* the following variable is used for caching string comparisons
3541 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3542 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003543
3544 if (output == NULL) {
3545 PyErr_BadArgument();
3546 return -1;
3547 }
3548
3549 p = s;
3550 end = s + length;
3551 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003553 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 PyObject *repunicode;
3555 int repsize;
3556 int newpos;
3557 Py_UNICODE *uni2;
3558 Py_UNICODE *collstart;
3559 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003560
Guido van Rossum9e896b32000-04-05 20:11:21 +00003561 if (Py_UNICODE_ISSPACE(ch)) {
3562 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003564 continue;
3565 }
3566 decimal = Py_UNICODE_TODECIMAL(ch);
3567 if (decimal >= 0) {
3568 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003570 continue;
3571 }
Guido van Rossumba477042000-04-06 18:18:10 +00003572 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003573 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003575 continue;
3576 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 /* All other characters are considered unencodable */
3578 collstart = p;
3579 collend = p+1;
3580 while (collend < end) {
3581 if ((0 < *collend && *collend < 256) ||
3582 !Py_UNICODE_ISSPACE(*collend) ||
3583 Py_UNICODE_TODECIMAL(*collend))
3584 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003585 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 /* cache callback name lookup
3587 * (if not done yet, i.e. it's the first error) */
3588 if (known_errorHandler==-1) {
3589 if ((errors==NULL) || (!strcmp(errors, "strict")))
3590 known_errorHandler = 1;
3591 else if (!strcmp(errors, "replace"))
3592 known_errorHandler = 2;
3593 else if (!strcmp(errors, "ignore"))
3594 known_errorHandler = 3;
3595 else if (!strcmp(errors, "xmlcharrefreplace"))
3596 known_errorHandler = 4;
3597 else
3598 known_errorHandler = 0;
3599 }
3600 switch (known_errorHandler) {
3601 case 1: /* strict */
3602 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3603 goto onError;
3604 case 2: /* replace */
3605 for (p = collstart; p < collend; ++p)
3606 *output++ = '?';
3607 /* fall through */
3608 case 3: /* ignore */
3609 p = collend;
3610 break;
3611 case 4: /* xmlcharrefreplace */
3612 /* generate replacement (temporarily (mis)uses p) */
3613 for (p = collstart; p < collend; ++p)
3614 output += sprintf(output, "&#%d;", (int)*p);
3615 p = collend;
3616 break;
3617 default:
3618 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3619 encoding, reason, s, length, &exc,
3620 collstart-s, collend-s, &newpos);
3621 if (repunicode == NULL)
3622 goto onError;
3623 /* generate replacement */
3624 repsize = PyUnicode_GET_SIZE(repunicode);
3625 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3626 Py_UNICODE ch = *uni2;
3627 if (Py_UNICODE_ISSPACE(ch))
3628 *output++ = ' ';
3629 else {
3630 decimal = Py_UNICODE_TODECIMAL(ch);
3631 if (decimal >= 0)
3632 *output++ = '0' + decimal;
3633 else if (0 < ch && ch < 256)
3634 *output++ = (char)ch;
3635 else {
3636 Py_DECREF(repunicode);
3637 raise_encode_exception(&exc, encoding,
3638 s, length, collstart-s, collend-s, reason);
3639 goto onError;
3640 }
3641 }
3642 }
3643 p = s + newpos;
3644 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003645 }
3646 }
3647 /* 0-terminate the output string */
3648 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003649 Py_XDECREF(exc);
3650 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003651 return 0;
3652
3653 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 Py_XDECREF(exc);
3655 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003656 return -1;
3657}
3658
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659/* --- Helpers ------------------------------------------------------------ */
3660
Tim Petersced69f82003-09-16 20:30:58 +00003661static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662int count(PyUnicodeObject *self,
3663 int start,
3664 int end,
3665 PyUnicodeObject *substring)
3666{
3667 int count = 0;
3668
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003669 if (start < 0)
3670 start += self->length;
3671 if (start < 0)
3672 start = 0;
3673 if (end > self->length)
3674 end = self->length;
3675 if (end < 0)
3676 end += self->length;
3677 if (end < 0)
3678 end = 0;
3679
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003680 if (substring->length == 0)
3681 return (end - start + 1);
3682
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 end -= substring->length;
3684
3685 while (start <= end)
3686 if (Py_UNICODE_MATCH(self, start, substring)) {
3687 count++;
3688 start += substring->length;
3689 } else
3690 start++;
3691
3692 return count;
3693}
3694
3695int PyUnicode_Count(PyObject *str,
3696 PyObject *substr,
3697 int start,
3698 int end)
3699{
3700 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003701
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 str = PyUnicode_FromObject(str);
3703 if (str == NULL)
3704 return -1;
3705 substr = PyUnicode_FromObject(substr);
3706 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003707 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 return -1;
3709 }
Tim Petersced69f82003-09-16 20:30:58 +00003710
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 result = count((PyUnicodeObject *)str,
3712 start, end,
3713 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003714
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 Py_DECREF(str);
3716 Py_DECREF(substr);
3717 return result;
3718}
3719
Tim Petersced69f82003-09-16 20:30:58 +00003720static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721int findstring(PyUnicodeObject *self,
3722 PyUnicodeObject *substring,
3723 int start,
3724 int end,
3725 int direction)
3726{
3727 if (start < 0)
3728 start += self->length;
3729 if (start < 0)
3730 start = 0;
3731
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 if (end > self->length)
3733 end = self->length;
3734 if (end < 0)
3735 end += self->length;
3736 if (end < 0)
3737 end = 0;
3738
Guido van Rossum76afbd92002-08-20 17:29:29 +00003739 if (substring->length == 0)
3740 return (direction > 0) ? start : end;
3741
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 end -= substring->length;
3743
3744 if (direction < 0) {
3745 for (; end >= start; end--)
3746 if (Py_UNICODE_MATCH(self, end, substring))
3747 return end;
3748 } else {
3749 for (; start <= end; start++)
3750 if (Py_UNICODE_MATCH(self, start, substring))
3751 return start;
3752 }
3753
3754 return -1;
3755}
3756
3757int PyUnicode_Find(PyObject *str,
3758 PyObject *substr,
3759 int start,
3760 int end,
3761 int direction)
3762{
3763 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003764
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 str = PyUnicode_FromObject(str);
3766 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003767 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768 substr = PyUnicode_FromObject(substr);
3769 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003770 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003771 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 }
Tim Petersced69f82003-09-16 20:30:58 +00003773
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774 result = findstring((PyUnicodeObject *)str,
3775 (PyUnicodeObject *)substr,
3776 start, end, direction);
3777 Py_DECREF(str);
3778 Py_DECREF(substr);
3779 return result;
3780}
3781
Tim Petersced69f82003-09-16 20:30:58 +00003782static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783int tailmatch(PyUnicodeObject *self,
3784 PyUnicodeObject *substring,
3785 int start,
3786 int end,
3787 int direction)
3788{
3789 if (start < 0)
3790 start += self->length;
3791 if (start < 0)
3792 start = 0;
3793
3794 if (substring->length == 0)
3795 return 1;
3796
3797 if (end > self->length)
3798 end = self->length;
3799 if (end < 0)
3800 end += self->length;
3801 if (end < 0)
3802 end = 0;
3803
3804 end -= substring->length;
3805 if (end < start)
3806 return 0;
3807
3808 if (direction > 0) {
3809 if (Py_UNICODE_MATCH(self, end, substring))
3810 return 1;
3811 } else {
3812 if (Py_UNICODE_MATCH(self, start, substring))
3813 return 1;
3814 }
3815
3816 return 0;
3817}
3818
3819int PyUnicode_Tailmatch(PyObject *str,
3820 PyObject *substr,
3821 int start,
3822 int end,
3823 int direction)
3824{
3825 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003826
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 str = PyUnicode_FromObject(str);
3828 if (str == NULL)
3829 return -1;
3830 substr = PyUnicode_FromObject(substr);
3831 if (substr == NULL) {
3832 Py_DECREF(substr);
3833 return -1;
3834 }
Tim Petersced69f82003-09-16 20:30:58 +00003835
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 result = tailmatch((PyUnicodeObject *)str,
3837 (PyUnicodeObject *)substr,
3838 start, end, direction);
3839 Py_DECREF(str);
3840 Py_DECREF(substr);
3841 return result;
3842}
3843
Tim Petersced69f82003-09-16 20:30:58 +00003844static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845const Py_UNICODE *findchar(const Py_UNICODE *s,
3846 int size,
3847 Py_UNICODE ch)
3848{
3849 /* like wcschr, but doesn't stop at NULL characters */
3850
3851 while (size-- > 0) {
3852 if (*s == ch)
3853 return s;
3854 s++;
3855 }
3856
3857 return NULL;
3858}
3859
3860/* Apply fixfct filter to the Unicode object self and return a
3861 reference to the modified object */
3862
Tim Petersced69f82003-09-16 20:30:58 +00003863static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864PyObject *fixup(PyUnicodeObject *self,
3865 int (*fixfct)(PyUnicodeObject *s))
3866{
3867
3868 PyUnicodeObject *u;
3869
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003870 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 if (u == NULL)
3872 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003873
3874 Py_UNICODE_COPY(u->str, self->str, self->length);
3875
Tim Peters7a29bd52001-09-12 03:03:31 +00003876 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877 /* fixfct should return TRUE if it modified the buffer. If
3878 FALSE, return a reference to the original buffer instead
3879 (to save space, not time) */
3880 Py_INCREF(self);
3881 Py_DECREF(u);
3882 return (PyObject*) self;
3883 }
3884 return (PyObject*) u;
3885}
3886
Tim Petersced69f82003-09-16 20:30:58 +00003887static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888int fixupper(PyUnicodeObject *self)
3889{
3890 int len = self->length;
3891 Py_UNICODE *s = self->str;
3892 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003893
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 while (len-- > 0) {
3895 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003896
Guido van Rossumd57fd912000-03-10 22:53:23 +00003897 ch = Py_UNICODE_TOUPPER(*s);
3898 if (ch != *s) {
3899 status = 1;
3900 *s = ch;
3901 }
3902 s++;
3903 }
3904
3905 return status;
3906}
3907
Tim Petersced69f82003-09-16 20:30:58 +00003908static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909int fixlower(PyUnicodeObject *self)
3910{
3911 int len = self->length;
3912 Py_UNICODE *s = self->str;
3913 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003914
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915 while (len-- > 0) {
3916 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003917
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918 ch = Py_UNICODE_TOLOWER(*s);
3919 if (ch != *s) {
3920 status = 1;
3921 *s = ch;
3922 }
3923 s++;
3924 }
3925
3926 return status;
3927}
3928
Tim Petersced69f82003-09-16 20:30:58 +00003929static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930int fixswapcase(PyUnicodeObject *self)
3931{
3932 int len = self->length;
3933 Py_UNICODE *s = self->str;
3934 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003935
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 while (len-- > 0) {
3937 if (Py_UNICODE_ISUPPER(*s)) {
3938 *s = Py_UNICODE_TOLOWER(*s);
3939 status = 1;
3940 } else if (Py_UNICODE_ISLOWER(*s)) {
3941 *s = Py_UNICODE_TOUPPER(*s);
3942 status = 1;
3943 }
3944 s++;
3945 }
3946
3947 return status;
3948}
3949
Tim Petersced69f82003-09-16 20:30:58 +00003950static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951int fixcapitalize(PyUnicodeObject *self)
3952{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003953 int len = self->length;
3954 Py_UNICODE *s = self->str;
3955 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003956
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003957 if (len == 0)
3958 return 0;
3959 if (Py_UNICODE_ISLOWER(*s)) {
3960 *s = Py_UNICODE_TOUPPER(*s);
3961 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003963 s++;
3964 while (--len > 0) {
3965 if (Py_UNICODE_ISUPPER(*s)) {
3966 *s = Py_UNICODE_TOLOWER(*s);
3967 status = 1;
3968 }
3969 s++;
3970 }
3971 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972}
3973
3974static
3975int fixtitle(PyUnicodeObject *self)
3976{
3977 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3978 register Py_UNICODE *e;
3979 int previous_is_cased;
3980
3981 /* Shortcut for single character strings */
3982 if (PyUnicode_GET_SIZE(self) == 1) {
3983 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3984 if (*p != ch) {
3985 *p = ch;
3986 return 1;
3987 }
3988 else
3989 return 0;
3990 }
Tim Petersced69f82003-09-16 20:30:58 +00003991
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 e = p + PyUnicode_GET_SIZE(self);
3993 previous_is_cased = 0;
3994 for (; p < e; p++) {
3995 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003996
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 if (previous_is_cased)
3998 *p = Py_UNICODE_TOLOWER(ch);
3999 else
4000 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004001
4002 if (Py_UNICODE_ISLOWER(ch) ||
4003 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 Py_UNICODE_ISTITLE(ch))
4005 previous_is_cased = 1;
4006 else
4007 previous_is_cased = 0;
4008 }
4009 return 1;
4010}
4011
Tim Peters8ce9f162004-08-27 01:49:32 +00004012PyObject *
4013PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014{
Tim Peters8ce9f162004-08-27 01:49:32 +00004015 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004016 const Py_UNICODE blank = ' ';
4017 const Py_UNICODE *sep = &blank;
4018 size_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004019 PyUnicodeObject *res = NULL; /* the result */
4020 size_t res_alloc = 100; /* # allocated bytes for string in res */
4021 size_t res_used; /* # used bytes */
4022 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4023 PyObject *fseq; /* PySequence_Fast(seq) */
4024 int seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004025 PyObject *item;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 int i;
4027
Tim Peters05eba1f2004-08-27 21:32:02 +00004028 fseq = PySequence_Fast(seq, "");
4029 if (fseq == NULL) {
4030 if (PyErr_ExceptionMatches(PyExc_TypeError))
4031 PyErr_Format(PyExc_TypeError,
4032 "sequence expected, %.80s found",
4033 seq->ob_type->tp_name);
4034 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004035 }
4036
Tim Peters91879ab2004-08-27 22:35:44 +00004037 /* Grrrr. A codec may be invoked to convert str objects to
4038 * Unicode, and so it's possible to call back into Python code
4039 * during PyUnicode_FromObject(), and so it's possible for a sick
4040 * codec to change the size of fseq (if seq is a list). Therefore
4041 * we have to keep refetching the size -- can't assume seqlen
4042 * is invariant.
4043 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004044 seqlen = PySequence_Fast_GET_SIZE(fseq);
4045 /* If empty sequence, return u"". */
4046 if (seqlen == 0) {
4047 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4048 goto Done;
4049 }
4050 /* If singleton sequence with an exact Unicode, return that. */
4051 if (seqlen == 1) {
4052 item = PySequence_Fast_GET_ITEM(fseq, 0);
4053 if (PyUnicode_CheckExact(item)) {
4054 Py_INCREF(item);
4055 res = (PyUnicodeObject *)item;
4056 goto Done;
4057 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004058 }
4059
Tim Peters05eba1f2004-08-27 21:32:02 +00004060 /* At least two items to join, or one that isn't exact Unicode. */
4061 if (seqlen > 1) {
4062 /* Set up sep and seplen -- they're needed. */
4063 if (separator == NULL) {
4064 sep = &blank;
4065 seplen = 1;
4066 }
4067 else {
4068 internal_separator = PyUnicode_FromObject(separator);
4069 if (internal_separator == NULL)
4070 goto onError;
4071 sep = PyUnicode_AS_UNICODE(internal_separator);
4072 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004073 /* In case PyUnicode_FromObject() mutated seq. */
4074 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004075 }
4076 }
4077
4078 /* Get space. */
4079 res = _PyUnicode_New((int)res_alloc);
4080 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004081 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004082 res_p = PyUnicode_AS_UNICODE(res);
4083 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004084
Tim Peters05eba1f2004-08-27 21:32:02 +00004085 for (i = 0; i < seqlen; ++i) {
4086 size_t itemlen;
4087 size_t new_res_used;
4088
4089 item = PySequence_Fast_GET_ITEM(fseq, i);
4090 /* Convert item to Unicode. */
4091 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4092 PyErr_Format(PyExc_TypeError,
4093 "sequence item %i: expected string or Unicode,"
4094 " %.80s found",
4095 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004096 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004097 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004098 item = PyUnicode_FromObject(item);
4099 if (item == NULL)
4100 goto onError;
4101 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004102
Tim Peters91879ab2004-08-27 22:35:44 +00004103 /* In case PyUnicode_FromObject() mutated seq. */
4104 seqlen = PySequence_Fast_GET_SIZE(fseq);
4105
Tim Peters8ce9f162004-08-27 01:49:32 +00004106 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004108 new_res_used = res_used + itemlen;
4109 if (new_res_used < res_used || new_res_used > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004110 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004111 if (i < seqlen - 1) {
4112 new_res_used += seplen;
4113 if (new_res_used < res_used || new_res_used > INT_MAX)
4114 goto Overflow;
4115 }
4116 if (new_res_used > res_alloc) {
4117 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004118 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004119 size_t oldsize = res_alloc;
4120 res_alloc += res_alloc;
4121 if (res_alloc < oldsize || res_alloc > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004122 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004123 } while (new_res_used > res_alloc);
4124 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004125 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004127 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004128 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004130
4131 /* Copy item, and maybe the separator. */
4132 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4133 res_p += itemlen;
4134 if (i < seqlen - 1) {
4135 Py_UNICODE_COPY(res_p, sep, (int)seplen);
4136 res_p += seplen;
4137 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004139 res_used = new_res_used;
4140 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004141
Tim Peters05eba1f2004-08-27 21:32:02 +00004142 /* Shrink res to match the used area; this probably can't fail,
4143 * but it's cheap to check.
4144 */
4145 if (_PyUnicode_Resize(&res, (int)res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004146 goto onError;
4147
4148 Done:
4149 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004150 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151 return (PyObject *)res;
4152
Tim Peters8ce9f162004-08-27 01:49:32 +00004153 Overflow:
4154 PyErr_SetString(PyExc_OverflowError,
4155 "join() is too long for a Python string");
4156 Py_DECREF(item);
4157 /* fall through */
4158
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004160 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004161 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004162 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163 return NULL;
4164}
4165
Tim Petersced69f82003-09-16 20:30:58 +00004166static
4167PyUnicodeObject *pad(PyUnicodeObject *self,
4168 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 int right,
4170 Py_UNICODE fill)
4171{
4172 PyUnicodeObject *u;
4173
4174 if (left < 0)
4175 left = 0;
4176 if (right < 0)
4177 right = 0;
4178
Tim Peters7a29bd52001-09-12 03:03:31 +00004179 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 Py_INCREF(self);
4181 return self;
4182 }
4183
4184 u = _PyUnicode_New(left + self->length + right);
4185 if (u) {
4186 if (left)
4187 Py_UNICODE_FILL(u->str, fill, left);
4188 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4189 if (right)
4190 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4191 }
4192
4193 return u;
4194}
4195
/* Append the slice data[left:right] to `list` as a new Unicode object.
   Relies on `str` and `list` variables and an `onError` label in the
   enclosing function; jumps to onError if the slice cannot be created
   or appended.  The body is wrapped in do { } while (0) so that the
   macro expands to exactly one statement and is safe in unbraced
   if/else contexts. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4206
/* Insert the slice data[left:right] at the front of `list` as a new
   Unicode object.  Relies on `str` and `list` variables and an
   `onError` label in the enclosing function; jumps to onError if the
   slice cannot be created or inserted.  The body is wrapped in
   do { } while (0) so that the macro expands to exactly one statement
   and is safe in unbraced if/else contexts. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4217
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218static
4219PyObject *split_whitespace(PyUnicodeObject *self,
4220 PyObject *list,
4221 int maxcount)
4222{
4223 register int i;
4224 register int j;
4225 int len = self->length;
4226 PyObject *str;
4227
4228 for (i = j = 0; i < len; ) {
4229 /* find a token */
4230 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4231 i++;
4232 j = i;
4233 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4234 i++;
4235 if (j < i) {
4236 if (maxcount-- <= 0)
4237 break;
4238 SPLIT_APPEND(self->str, j, i);
4239 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4240 i++;
4241 j = i;
4242 }
4243 }
4244 if (j < len) {
4245 SPLIT_APPEND(self->str, j, len);
4246 }
4247 return list;
4248
4249 onError:
4250 Py_DECREF(list);
4251 return NULL;
4252}
4253
4254PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004255 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256{
4257 register int i;
4258 register int j;
4259 int len;
4260 PyObject *list;
4261 PyObject *str;
4262 Py_UNICODE *data;
4263
4264 string = PyUnicode_FromObject(string);
4265 if (string == NULL)
4266 return NULL;
4267 data = PyUnicode_AS_UNICODE(string);
4268 len = PyUnicode_GET_SIZE(string);
4269
Guido van Rossumd57fd912000-03-10 22:53:23 +00004270 list = PyList_New(0);
4271 if (!list)
4272 goto onError;
4273
4274 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004275 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004276
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277 /* Find a line and append it */
4278 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4279 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280
4281 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004282 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 if (i < len) {
4284 if (data[i] == '\r' && i + 1 < len &&
4285 data[i+1] == '\n')
4286 i += 2;
4287 else
4288 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004289 if (keepends)
4290 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 }
Guido van Rossum86662912000-04-11 15:38:46 +00004292 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293 j = i;
4294 }
4295 if (j < len) {
4296 SPLIT_APPEND(data, j, len);
4297 }
4298
4299 Py_DECREF(string);
4300 return list;
4301
4302 onError:
4303 Py_DECREF(list);
4304 Py_DECREF(string);
4305 return NULL;
4306}
4307
Tim Petersced69f82003-09-16 20:30:58 +00004308static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309PyObject *split_char(PyUnicodeObject *self,
4310 PyObject *list,
4311 Py_UNICODE ch,
4312 int maxcount)
4313{
4314 register int i;
4315 register int j;
4316 int len = self->length;
4317 PyObject *str;
4318
4319 for (i = j = 0; i < len; ) {
4320 if (self->str[i] == ch) {
4321 if (maxcount-- <= 0)
4322 break;
4323 SPLIT_APPEND(self->str, j, i);
4324 i = j = i + 1;
4325 } else
4326 i++;
4327 }
4328 if (j <= len) {
4329 SPLIT_APPEND(self->str, j, len);
4330 }
4331 return list;
4332
4333 onError:
4334 Py_DECREF(list);
4335 return NULL;
4336}
4337
Tim Petersced69f82003-09-16 20:30:58 +00004338static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339PyObject *split_substring(PyUnicodeObject *self,
4340 PyObject *list,
4341 PyUnicodeObject *substring,
4342 int maxcount)
4343{
4344 register int i;
4345 register int j;
4346 int len = self->length;
4347 int sublen = substring->length;
4348 PyObject *str;
4349
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004350 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351 if (Py_UNICODE_MATCH(self, i, substring)) {
4352 if (maxcount-- <= 0)
4353 break;
4354 SPLIT_APPEND(self->str, j, i);
4355 i = j = i + sublen;
4356 } else
4357 i++;
4358 }
4359 if (j <= len) {
4360 SPLIT_APPEND(self->str, j, len);
4361 }
4362 return list;
4363
4364 onError:
4365 Py_DECREF(list);
4366 return NULL;
4367}
4368
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004369static
4370PyObject *rsplit_whitespace(PyUnicodeObject *self,
4371 PyObject *list,
4372 int maxcount)
4373{
4374 register int i;
4375 register int j;
4376 int len = self->length;
4377 PyObject *str;
4378
4379 for (i = j = len - 1; i >= 0; ) {
4380 /* find a token */
4381 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4382 i--;
4383 j = i;
4384 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4385 i--;
4386 if (j > i) {
4387 if (maxcount-- <= 0)
4388 break;
4389 SPLIT_INSERT(self->str, i + 1, j + 1);
4390 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4391 i--;
4392 j = i;
4393 }
4394 }
4395 if (j >= 0) {
4396 SPLIT_INSERT(self->str, 0, j + 1);
4397 }
4398 return list;
4399
4400 onError:
4401 Py_DECREF(list);
4402 return NULL;
4403}
4404
4405static
4406PyObject *rsplit_char(PyUnicodeObject *self,
4407 PyObject *list,
4408 Py_UNICODE ch,
4409 int maxcount)
4410{
4411 register int i;
4412 register int j;
4413 int len = self->length;
4414 PyObject *str;
4415
4416 for (i = j = len - 1; i >= 0; ) {
4417 if (self->str[i] == ch) {
4418 if (maxcount-- <= 0)
4419 break;
4420 SPLIT_INSERT(self->str, i + 1, j + 1);
4421 j = i = i - 1;
4422 } else
4423 i--;
4424 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004425 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004426 SPLIT_INSERT(self->str, 0, j + 1);
4427 }
4428 return list;
4429
4430 onError:
4431 Py_DECREF(list);
4432 return NULL;
4433}
4434
4435static
4436PyObject *rsplit_substring(PyUnicodeObject *self,
4437 PyObject *list,
4438 PyUnicodeObject *substring,
4439 int maxcount)
4440{
4441 register int i;
4442 register int j;
4443 int len = self->length;
4444 int sublen = substring->length;
4445 PyObject *str;
4446
4447 for (i = len - sublen, j = len; i >= 0; ) {
4448 if (Py_UNICODE_MATCH(self, i, substring)) {
4449 if (maxcount-- <= 0)
4450 break;
4451 SPLIT_INSERT(self->str, i + sublen, j);
4452 j = i;
4453 i -= sublen;
4454 } else
4455 i--;
4456 }
4457 if (j >= 0) {
4458 SPLIT_INSERT(self->str, 0, j);
4459 }
4460 return list;
4461
4462 onError:
4463 Py_DECREF(list);
4464 return NULL;
4465}
4466
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004468#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469
4470static
4471PyObject *split(PyUnicodeObject *self,
4472 PyUnicodeObject *substring,
4473 int maxcount)
4474{
4475 PyObject *list;
4476
4477 if (maxcount < 0)
4478 maxcount = INT_MAX;
4479
4480 list = PyList_New(0);
4481 if (!list)
4482 return NULL;
4483
4484 if (substring == NULL)
4485 return split_whitespace(self,list,maxcount);
4486
4487 else if (substring->length == 1)
4488 return split_char(self,list,substring->str[0],maxcount);
4489
4490 else if (substring->length == 0) {
4491 Py_DECREF(list);
4492 PyErr_SetString(PyExc_ValueError, "empty separator");
4493 return NULL;
4494 }
4495 else
4496 return split_substring(self,list,substring,maxcount);
4497}
4498
Tim Petersced69f82003-09-16 20:30:58 +00004499static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004500PyObject *rsplit(PyUnicodeObject *self,
4501 PyUnicodeObject *substring,
4502 int maxcount)
4503{
4504 PyObject *list;
4505
4506 if (maxcount < 0)
4507 maxcount = INT_MAX;
4508
4509 list = PyList_New(0);
4510 if (!list)
4511 return NULL;
4512
4513 if (substring == NULL)
4514 return rsplit_whitespace(self,list,maxcount);
4515
4516 else if (substring->length == 1)
4517 return rsplit_char(self,list,substring->str[0],maxcount);
4518
4519 else if (substring->length == 0) {
4520 Py_DECREF(list);
4521 PyErr_SetString(PyExc_ValueError, "empty separator");
4522 return NULL;
4523 }
4524 else
4525 return rsplit_substring(self,list,substring,maxcount);
4526}
4527
4528static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529PyObject *replace(PyUnicodeObject *self,
4530 PyUnicodeObject *str1,
4531 PyUnicodeObject *str2,
4532 int maxcount)
4533{
4534 PyUnicodeObject *u;
4535
4536 if (maxcount < 0)
4537 maxcount = INT_MAX;
4538
4539 if (str1->length == 1 && str2->length == 1) {
4540 int i;
4541
4542 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004543 if (!findchar(self->str, self->length, str1->str[0]) &&
4544 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545 /* nothing to replace, return original string */
4546 Py_INCREF(self);
4547 u = self;
4548 } else {
4549 Py_UNICODE u1 = str1->str[0];
4550 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004551
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004553 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554 self->length
4555 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004556 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004557 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004558 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559 for (i = 0; i < u->length; i++)
4560 if (u->str[i] == u1) {
4561 if (--maxcount < 0)
4562 break;
4563 u->str[i] = u2;
4564 }
4565 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567
4568 } else {
4569 int n, i;
4570 Py_UNICODE *p;
4571
4572 /* replace strings */
4573 n = count(self, 0, self->length, str1);
4574 if (n > maxcount)
4575 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004576 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004578 if (PyUnicode_CheckExact(self)) {
4579 Py_INCREF(self);
4580 u = self;
4581 }
4582 else {
4583 u = (PyUnicodeObject *)
4584 PyUnicode_FromUnicode(self->str, self->length);
4585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586 } else {
4587 u = _PyUnicode_New(
4588 self->length + n * (str2->length - str1->length));
4589 if (u) {
4590 i = 0;
4591 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004592 if (str1->length > 0) {
4593 while (i <= self->length - str1->length)
4594 if (Py_UNICODE_MATCH(self, i, str1)) {
4595 /* replace string segment */
4596 Py_UNICODE_COPY(p, str2->str, str2->length);
4597 p += str2->length;
4598 i += str1->length;
4599 if (--n <= 0) {
4600 /* copy remaining part */
4601 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4602 break;
4603 }
4604 } else
4605 *p++ = self->str[i++];
4606 } else {
4607 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004608 Py_UNICODE_COPY(p, str2->str, str2->length);
4609 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004610 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004613 }
4614 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4615 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616 }
4617 }
4618 }
Tim Petersced69f82003-09-16 20:30:58 +00004619
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620 return (PyObject *) u;
4621}
4622
4623/* --- Unicode Object Methods --------------------------------------------- */
4624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004625PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626"S.title() -> unicode\n\
4627\n\
4628Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004629characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630
4631static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004632unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634 return fixup(self, fixtitle);
4635}
4636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004637PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004638"S.capitalize() -> unicode\n\
4639\n\
4640Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004641have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642
4643static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004644unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 return fixup(self, fixcapitalize);
4647}
4648
#if 0
/* Disabled: this method is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split S on whitespace, capitalize every word in place in the list,
   then join the words back together.  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
4686
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004687/* Argument converter. Coerces to a single unicode character */
4688
4689static int
4690convert_uc(PyObject *obj, void *addr)
4691{
4692 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4693 PyObject *uniobj;
4694 Py_UNICODE *unistr;
4695
4696 uniobj = PyUnicode_FromObject(obj);
4697 if (uniobj == NULL) {
4698 PyErr_SetString(PyExc_TypeError,
4699 "The fill character cannot be converted to Unicode");
4700 return 0;
4701 }
4702 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4703 PyErr_SetString(PyExc_TypeError,
4704 "The fill character must be exactly one character long");
4705 Py_DECREF(uniobj);
4706 return 0;
4707 }
4708 unistr = PyUnicode_AS_UNICODE(uniobj);
4709 *fillcharloc = unistr[0];
4710 Py_DECREF(uniobj);
4711 return 1;
4712}
4713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004714PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004715"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004717Return S centered in a Unicode string of length width. Padding is\n\
4718done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719
4720static PyObject *
4721unicode_center(PyUnicodeObject *self, PyObject *args)
4722{
4723 int marg, left;
4724 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004725 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004727 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 return NULL;
4729
Tim Peters7a29bd52001-09-12 03:03:31 +00004730 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 Py_INCREF(self);
4732 return (PyObject*) self;
4733 }
4734
4735 marg = width - self->length;
4736 left = marg / 2 + (marg & width & 1);
4737
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004738 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739}
4740
Marc-André Lemburge5034372000-08-08 08:04:29 +00004741#if 0
4742
4743/* This code should go into some future Unicode collation support
4744 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004745 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004746
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004747/* speedy UTF-16 code point order comparison */
4748/* gleaned from: */
4749/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4750
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004751static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004752{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004753 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004754 0, 0, 0, 0, 0, 0, 0, 0,
4755 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004756 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004757};
4758
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759static int
4760unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4761{
4762 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004763
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 Py_UNICODE *s1 = str1->str;
4765 Py_UNICODE *s2 = str2->str;
4766
4767 len1 = str1->length;
4768 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004769
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004771 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004772
4773 c1 = *s1++;
4774 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004775
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004776 if (c1 > (1<<11) * 26)
4777 c1 += utf16Fixup[c1>>11];
4778 if (c2 > (1<<11) * 26)
4779 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004780 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004781
4782 if (c1 != c2)
4783 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004784
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004785 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 }
4787
4788 return (len1 < len2) ? -1 : (len1 != len2);
4789}
4790
Marc-André Lemburge5034372000-08-08 08:04:29 +00004791#else
4792
4793static int
4794unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4795{
4796 register int len1, len2;
4797
4798 Py_UNICODE *s1 = str1->str;
4799 Py_UNICODE *s2 = str2->str;
4800
4801 len1 = str1->length;
4802 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004803
Marc-André Lemburge5034372000-08-08 08:04:29 +00004804 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004805 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004806
Fredrik Lundh45714e92001-06-26 16:39:36 +00004807 c1 = *s1++;
4808 c2 = *s2++;
4809
4810 if (c1 != c2)
4811 return (c1 < c2) ? -1 : 1;
4812
Marc-André Lemburge5034372000-08-08 08:04:29 +00004813 len1--; len2--;
4814 }
4815
4816 return (len1 < len2) ? -1 : (len1 != len2);
4817}
4818
4819#endif
4820
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821int PyUnicode_Compare(PyObject *left,
4822 PyObject *right)
4823{
4824 PyUnicodeObject *u = NULL, *v = NULL;
4825 int result;
4826
4827 /* Coerce the two arguments */
4828 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4829 if (u == NULL)
4830 goto onError;
4831 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4832 if (v == NULL)
4833 goto onError;
4834
Thomas Wouters7e474022000-07-16 12:04:32 +00004835 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836 if (v == u) {
4837 Py_DECREF(u);
4838 Py_DECREF(v);
4839 return 0;
4840 }
4841
4842 result = unicode_compare(u, v);
4843
4844 Py_DECREF(u);
4845 Py_DECREF(v);
4846 return result;
4847
4848onError:
4849 Py_XDECREF(u);
4850 Py_XDECREF(v);
4851 return -1;
4852}
4853
Guido van Rossum403d68b2000-03-13 15:55:09 +00004854int PyUnicode_Contains(PyObject *container,
4855 PyObject *element)
4856{
4857 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004858 int result, size;
4859 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004860
4861 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004862 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004863 if (v == NULL) {
4864 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004865 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004866 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004867 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004868 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004869 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004870 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004871
Barry Warsaw817918c2002-08-06 16:58:21 +00004872 size = PyUnicode_GET_SIZE(v);
4873 rhs = PyUnicode_AS_UNICODE(v);
4874 lhs = PyUnicode_AS_UNICODE(u);
4875
Guido van Rossum403d68b2000-03-13 15:55:09 +00004876 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004877 if (size == 1) {
4878 end = lhs + PyUnicode_GET_SIZE(u);
4879 while (lhs < end) {
4880 if (*lhs++ == *rhs) {
4881 result = 1;
4882 break;
4883 }
4884 }
4885 }
4886 else {
4887 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4888 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004889 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004890 result = 1;
4891 break;
4892 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004893 }
4894 }
4895
4896 Py_DECREF(u);
4897 Py_DECREF(v);
4898 return result;
4899
4900onError:
4901 Py_XDECREF(u);
4902 Py_XDECREF(v);
4903 return -1;
4904}
4905
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906/* Concat to string or Unicode object giving a new Unicode object. */
4907
4908PyObject *PyUnicode_Concat(PyObject *left,
4909 PyObject *right)
4910{
4911 PyUnicodeObject *u = NULL, *v = NULL, *w;
4912
4913 /* Coerce the two arguments */
4914 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4915 if (u == NULL)
4916 goto onError;
4917 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4918 if (v == NULL)
4919 goto onError;
4920
4921 /* Shortcuts */
4922 if (v == unicode_empty) {
4923 Py_DECREF(v);
4924 return (PyObject *)u;
4925 }
4926 if (u == unicode_empty) {
4927 Py_DECREF(u);
4928 return (PyObject *)v;
4929 }
4930
4931 /* Concat the two Unicode strings */
4932 w = _PyUnicode_New(u->length + v->length);
4933 if (w == NULL)
4934 goto onError;
4935 Py_UNICODE_COPY(w->str, u->str, u->length);
4936 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4937
4938 Py_DECREF(u);
4939 Py_DECREF(v);
4940 return (PyObject *)w;
4941
4942onError:
4943 Py_XDECREF(u);
4944 Py_XDECREF(v);
4945 return NULL;
4946}
4947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004948PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949"S.count(sub[, start[, end]]) -> int\n\
4950\n\
4951Return the number of occurrences of substring sub in Unicode string\n\
4952S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004953interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954
4955static PyObject *
4956unicode_count(PyUnicodeObject *self, PyObject *args)
4957{
4958 PyUnicodeObject *substring;
4959 int start = 0;
4960 int end = INT_MAX;
4961 PyObject *result;
4962
Guido van Rossumb8872e62000-05-09 14:14:27 +00004963 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4964 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 return NULL;
4966
4967 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4968 (PyObject *)substring);
4969 if (substring == NULL)
4970 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004971
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972 if (start < 0)
4973 start += self->length;
4974 if (start < 0)
4975 start = 0;
4976 if (end > self->length)
4977 end = self->length;
4978 if (end < 0)
4979 end += self->length;
4980 if (end < 0)
4981 end = 0;
4982
4983 result = PyInt_FromLong((long) count(self, start, end, substring));
4984
4985 Py_DECREF(substring);
4986 return result;
4987}
4988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004989PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004990"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004992Encodes S using the codec registered for encoding. encoding defaults\n\
4993to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004994handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004995a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4996'xmlcharrefreplace' as well as any other name registered with\n\
4997codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998
4999static PyObject *
5000unicode_encode(PyUnicodeObject *self, PyObject *args)
5001{
5002 char *encoding = NULL;
5003 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005004 PyObject *v;
5005
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5007 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005008 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005009 if (v == NULL)
5010 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005011 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5012 PyErr_Format(PyExc_TypeError,
5013 "encoder did not return a string/unicode object "
5014 "(type=%.400s)",
5015 v->ob_type->tp_name);
5016 Py_DECREF(v);
5017 return NULL;
5018 }
5019 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005020
5021 onError:
5022 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005023}
5024
5025PyDoc_STRVAR(decode__doc__,
5026"S.decode([encoding[,errors]]) -> string or unicode\n\
5027\n\
5028Decodes S using the codec registered for encoding. encoding defaults\n\
5029to the default encoding. errors may be given to set a different error\n\
5030handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5031a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5032as well as any other name registerd with codecs.register_error that is\n\
5033able to handle UnicodeDecodeErrors.");
5034
5035static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005036unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005037{
5038 char *encoding = NULL;
5039 char *errors = NULL;
5040 PyObject *v;
5041
5042 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5043 return NULL;
5044 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005045 if (v == NULL)
5046 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005047 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5048 PyErr_Format(PyExc_TypeError,
5049 "decoder did not return a string/unicode object "
5050 "(type=%.400s)",
5051 v->ob_type->tp_name);
5052 Py_DECREF(v);
5053 return NULL;
5054 }
5055 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005056
5057 onError:
5058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059}
5060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005061PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062"S.expandtabs([tabsize]) -> unicode\n\
5063\n\
5064Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005065If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066
5067static PyObject*
5068unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5069{
5070 Py_UNICODE *e;
5071 Py_UNICODE *p;
5072 Py_UNICODE *q;
5073 int i, j;
5074 PyUnicodeObject *u;
5075 int tabsize = 8;
5076
5077 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5078 return NULL;
5079
Thomas Wouters7e474022000-07-16 12:04:32 +00005080 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081 i = j = 0;
5082 e = self->str + self->length;
5083 for (p = self->str; p < e; p++)
5084 if (*p == '\t') {
5085 if (tabsize > 0)
5086 j += tabsize - (j % tabsize);
5087 }
5088 else {
5089 j++;
5090 if (*p == '\n' || *p == '\r') {
5091 i += j;
5092 j = 0;
5093 }
5094 }
5095
5096 /* Second pass: create output string and fill it */
5097 u = _PyUnicode_New(i + j);
5098 if (!u)
5099 return NULL;
5100
5101 j = 0;
5102 q = u->str;
5103
5104 for (p = self->str; p < e; p++)
5105 if (*p == '\t') {
5106 if (tabsize > 0) {
5107 i = tabsize - (j % tabsize);
5108 j += i;
5109 while (i--)
5110 *q++ = ' ';
5111 }
5112 }
5113 else {
5114 j++;
5115 *q++ = *p;
5116 if (*p == '\n' || *p == '\r')
5117 j = 0;
5118 }
5119
5120 return (PyObject*) u;
5121}
5122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005123PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124"S.find(sub [,start [,end]]) -> int\n\
5125\n\
5126Return the lowest index in S where substring sub is found,\n\
5127such that sub is contained within s[start,end]. Optional\n\
5128arguments start and end are interpreted as in slice notation.\n\
5129\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005130Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131
5132static PyObject *
5133unicode_find(PyUnicodeObject *self, PyObject *args)
5134{
5135 PyUnicodeObject *substring;
5136 int start = 0;
5137 int end = INT_MAX;
5138 PyObject *result;
5139
Guido van Rossumb8872e62000-05-09 14:14:27 +00005140 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5141 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 return NULL;
5143 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5144 (PyObject *)substring);
5145 if (substring == NULL)
5146 return NULL;
5147
5148 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5149
5150 Py_DECREF(substring);
5151 return result;
5152}
5153
5154static PyObject *
5155unicode_getitem(PyUnicodeObject *self, int index)
5156{
5157 if (index < 0 || index >= self->length) {
5158 PyErr_SetString(PyExc_IndexError, "string index out of range");
5159 return NULL;
5160 }
5161
5162 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5163}
5164
5165static long
5166unicode_hash(PyUnicodeObject *self)
5167{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005168 /* Since Unicode objects compare equal to their ASCII string
5169 counterparts, they should use the individual character values
5170 as basis for their hash value. This is needed to assure that
5171 strings and Unicode objects behave in the same way as
5172 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173
Fredrik Lundhdde61642000-07-10 18:27:47 +00005174 register int len;
5175 register Py_UNICODE *p;
5176 register long x;
5177
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 if (self->hash != -1)
5179 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005180 len = PyUnicode_GET_SIZE(self);
5181 p = PyUnicode_AS_UNICODE(self);
5182 x = *p << 7;
5183 while (--len >= 0)
5184 x = (1000003*x) ^ *p++;
5185 x ^= PyUnicode_GET_SIZE(self);
5186 if (x == -1)
5187 x = -2;
5188 self->hash = x;
5189 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190}
5191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005192PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193"S.index(sub [,start [,end]]) -> int\n\
5194\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005195Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196
5197static PyObject *
5198unicode_index(PyUnicodeObject *self, PyObject *args)
5199{
5200 int result;
5201 PyUnicodeObject *substring;
5202 int start = 0;
5203 int end = INT_MAX;
5204
Guido van Rossumb8872e62000-05-09 14:14:27 +00005205 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5206 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005208
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5210 (PyObject *)substring);
5211 if (substring == NULL)
5212 return NULL;
5213
5214 result = findstring(self, substring, start, end, 1);
5215
5216 Py_DECREF(substring);
5217 if (result < 0) {
5218 PyErr_SetString(PyExc_ValueError, "substring not found");
5219 return NULL;
5220 }
5221 return PyInt_FromLong(result);
5222}
5223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005224PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005225"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005227Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005228at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229
5230static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005231unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232{
5233 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5234 register const Py_UNICODE *e;
5235 int cased;
5236
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 /* Shortcut for single character strings */
5238 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005239 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005241 /* Special case for empty strings */
5242 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005243 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005244
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245 e = p + PyUnicode_GET_SIZE(self);
5246 cased = 0;
5247 for (; p < e; p++) {
5248 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005249
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005251 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 else if (!cased && Py_UNICODE_ISLOWER(ch))
5253 cased = 1;
5254 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005255 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256}
5257
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005258PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005259"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005261Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005262at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263
5264static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005265unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266{
5267 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5268 register const Py_UNICODE *e;
5269 int cased;
5270
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 /* Shortcut for single character strings */
5272 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005273 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005275 /* Special case for empty strings */
5276 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005277 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005278
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279 e = p + PyUnicode_GET_SIZE(self);
5280 cased = 0;
5281 for (; p < e; p++) {
5282 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005283
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005285 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 else if (!cased && Py_UNICODE_ISUPPER(ch))
5287 cased = 1;
5288 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005289 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290}
5291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005292PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005293"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005295Return True if S is a titlecased string and there is at least one\n\
5296character in S, i.e. upper- and titlecase characters may only\n\
5297follow uncased characters and lowercase characters only cased ones.\n\
5298Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299
5300static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005301unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302{
5303 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5304 register const Py_UNICODE *e;
5305 int cased, previous_is_cased;
5306
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 /* Shortcut for single character strings */
5308 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005309 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5310 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005312 /* Special case for empty strings */
5313 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005314 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005315
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 e = p + PyUnicode_GET_SIZE(self);
5317 cased = 0;
5318 previous_is_cased = 0;
5319 for (; p < e; p++) {
5320 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005321
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5323 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005324 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 previous_is_cased = 1;
5326 cased = 1;
5327 }
5328 else if (Py_UNICODE_ISLOWER(ch)) {
5329 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005330 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 previous_is_cased = 1;
5332 cased = 1;
5333 }
5334 else
5335 previous_is_cased = 0;
5336 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005337 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338}
5339
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005340PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005341"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005343Return True if all characters in S are whitespace\n\
5344and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345
5346static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005347unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348{
5349 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5350 register const Py_UNICODE *e;
5351
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 /* Shortcut for single character strings */
5353 if (PyUnicode_GET_SIZE(self) == 1 &&
5354 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005355 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005357 /* Special case for empty strings */
5358 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005359 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005360
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 e = p + PyUnicode_GET_SIZE(self);
5362 for (; p < e; p++) {
5363 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005364 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005366 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367}
5368
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005369PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005370"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005371\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005372Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005373and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005374
5375static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005376unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005377{
5378 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5379 register const Py_UNICODE *e;
5380
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005381 /* Shortcut for single character strings */
5382 if (PyUnicode_GET_SIZE(self) == 1 &&
5383 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005384 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005385
5386 /* Special case for empty strings */
5387 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005388 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005389
5390 e = p + PyUnicode_GET_SIZE(self);
5391 for (; p < e; p++) {
5392 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005393 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005394 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005395 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005396}
5397
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005398PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005399"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005400\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005401Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005402and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005403
5404static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005405unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005406{
5407 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5408 register const Py_UNICODE *e;
5409
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005410 /* Shortcut for single character strings */
5411 if (PyUnicode_GET_SIZE(self) == 1 &&
5412 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005413 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005414
5415 /* Special case for empty strings */
5416 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005417 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005418
5419 e = p + PyUnicode_GET_SIZE(self);
5420 for (; p < e; p++) {
5421 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005422 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005423 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005424 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005425}
5426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005427PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005428"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005430Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005431False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432
5433static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005434unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435{
5436 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5437 register const Py_UNICODE *e;
5438
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 /* Shortcut for single character strings */
5440 if (PyUnicode_GET_SIZE(self) == 1 &&
5441 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005442 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005444 /* Special case for empty strings */
5445 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005446 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005447
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 e = p + PyUnicode_GET_SIZE(self);
5449 for (; p < e; p++) {
5450 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005451 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005453 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454}
5455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005456PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005457"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005459Return True if all characters in S are digits\n\
5460and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461
5462static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005463unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464{
5465 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5466 register const Py_UNICODE *e;
5467
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 /* Shortcut for single character strings */
5469 if (PyUnicode_GET_SIZE(self) == 1 &&
5470 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005471 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005473 /* Special case for empty strings */
5474 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005475 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005476
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 e = p + PyUnicode_GET_SIZE(self);
5478 for (; p < e; p++) {
5479 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005480 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005482 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483}
5484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005485PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005486"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005488Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005489False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490
5491static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005492unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493{
5494 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5495 register const Py_UNICODE *e;
5496
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 /* Shortcut for single character strings */
5498 if (PyUnicode_GET_SIZE(self) == 1 &&
5499 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005500 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005502 /* Special case for empty strings */
5503 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005504 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005505
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 e = p + PyUnicode_GET_SIZE(self);
5507 for (; p < e; p++) {
5508 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005509 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005511 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512}
5513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005514PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515"S.join(sequence) -> unicode\n\
5516\n\
5517Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005518sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519
5520static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005521unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005523 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524}
5525
5526static int
5527unicode_length(PyUnicodeObject *self)
5528{
5529 return self->length;
5530}
5531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005532PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005533"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534\n\
5535Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005536done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537
5538static PyObject *
5539unicode_ljust(PyUnicodeObject *self, PyObject *args)
5540{
5541 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005542 Py_UNICODE fillchar = ' ';
5543
5544 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 return NULL;
5546
Tim Peters7a29bd52001-09-12 03:03:31 +00005547 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 Py_INCREF(self);
5549 return (PyObject*) self;
5550 }
5551
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005552 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553}
5554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005555PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556"S.lower() -> unicode\n\
5557\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005558Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559
5560static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005561unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 return fixup(self, fixlower);
5564}
5565
/* Strip modes shared by the strip helpers; the values double as
   indices into stripformat[] below. */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* PyArg_ParseTuple format strings, one per strip mode */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string minus its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
5574
5575static const Py_UNICODE *
5576unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5577{
Tim Peters030a5ce2002-04-22 19:00:10 +00005578 size_t i;
5579 for (i = 0; i < n; ++i)
5580 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005581 return s+i;
5582 return NULL;
5583}
5584
5585/* externally visible for str.strip(unicode) */
5586PyObject *
5587_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5588{
5589 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5590 int len = PyUnicode_GET_SIZE(self);
5591 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5592 int seplen = PyUnicode_GET_SIZE(sepobj);
5593 int i, j;
5594
5595 i = 0;
5596 if (striptype != RIGHTSTRIP) {
5597 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5598 i++;
5599 }
5600 }
5601
5602 j = len;
5603 if (striptype != LEFTSTRIP) {
5604 do {
5605 j--;
5606 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5607 j++;
5608 }
5609
5610 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5611 Py_INCREF(self);
5612 return (PyObject*)self;
5613 }
5614 else
5615 return PyUnicode_FromUnicode(s+i, j-i);
5616}
5617
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618
5619static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005620do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005622 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5623 int len = PyUnicode_GET_SIZE(self), i, j;
5624
5625 i = 0;
5626 if (striptype != RIGHTSTRIP) {
5627 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5628 i++;
5629 }
5630 }
5631
5632 j = len;
5633 if (striptype != LEFTSTRIP) {
5634 do {
5635 j--;
5636 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5637 j++;
5638 }
5639
5640 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5641 Py_INCREF(self);
5642 return (PyObject*)self;
5643 }
5644 else
5645 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646}
5647
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005648
5649static PyObject *
5650do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5651{
5652 PyObject *sep = NULL;
5653
5654 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5655 return NULL;
5656
5657 if (sep != NULL && sep != Py_None) {
5658 if (PyUnicode_Check(sep))
5659 return _PyUnicode_XStrip(self, striptype, sep);
5660 else if (PyString_Check(sep)) {
5661 PyObject *res;
5662 sep = PyUnicode_FromObject(sep);
5663 if (sep==NULL)
5664 return NULL;
5665 res = _PyUnicode_XStrip(self, striptype, sep);
5666 Py_DECREF(sep);
5667 return res;
5668 }
5669 else {
5670 PyErr_Format(PyExc_TypeError,
5671 "%s arg must be None, unicode or str",
5672 STRIPNAME(striptype));
5673 return NULL;
5674 }
5675 }
5676
5677 return do_strip(self, striptype);
5678}
5679
5680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005681PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005682"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005683\n\
5684Return a copy of the string S with leading and trailing\n\
5685whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005686If chars is given and not None, remove characters in chars instead.\n\
5687If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005688
5689static PyObject *
5690unicode_strip(PyUnicodeObject *self, PyObject *args)
5691{
5692 if (PyTuple_GET_SIZE(args) == 0)
5693 return do_strip(self, BOTHSTRIP); /* Common case */
5694 else
5695 return do_argstrip(self, BOTHSTRIP, args);
5696}
5697
5698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005699PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005700"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005701\n\
5702Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005703If chars is given and not None, remove characters in chars instead.\n\
5704If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005705
5706static PyObject *
5707unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5708{
5709 if (PyTuple_GET_SIZE(args) == 0)
5710 return do_strip(self, LEFTSTRIP); /* Common case */
5711 else
5712 return do_argstrip(self, LEFTSTRIP, args);
5713}
5714
5715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005716PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005717"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005718\n\
5719Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005720If chars is given and not None, remove characters in chars instead.\n\
5721If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005722
5723static PyObject *
5724unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5725{
5726 if (PyTuple_GET_SIZE(args) == 0)
5727 return do_strip(self, RIGHTSTRIP); /* Common case */
5728 else
5729 return do_argstrip(self, RIGHTSTRIP, args);
5730}
5731
5732
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733static PyObject*
5734unicode_repeat(PyUnicodeObject *str, int len)
5735{
5736 PyUnicodeObject *u;
5737 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005738 int nchars;
5739 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740
5741 if (len < 0)
5742 len = 0;
5743
Tim Peters7a29bd52001-09-12 03:03:31 +00005744 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 /* no repeat, return original string */
5746 Py_INCREF(str);
5747 return (PyObject*) str;
5748 }
Tim Peters8f422462000-09-09 06:13:41 +00005749
5750 /* ensure # of chars needed doesn't overflow int and # of bytes
5751 * needed doesn't overflow size_t
5752 */
5753 nchars = len * str->length;
5754 if (len && nchars / len != str->length) {
5755 PyErr_SetString(PyExc_OverflowError,
5756 "repeated string is too long");
5757 return NULL;
5758 }
5759 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5760 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5761 PyErr_SetString(PyExc_OverflowError,
5762 "repeated string is too long");
5763 return NULL;
5764 }
5765 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 if (!u)
5767 return NULL;
5768
5769 p = u->str;
5770
5771 while (len-- > 0) {
5772 Py_UNICODE_COPY(p, str->str, str->length);
5773 p += str->length;
5774 }
5775
5776 return (PyObject*) u;
5777}
5778
5779PyObject *PyUnicode_Replace(PyObject *obj,
5780 PyObject *subobj,
5781 PyObject *replobj,
5782 int maxcount)
5783{
5784 PyObject *self;
5785 PyObject *str1;
5786 PyObject *str2;
5787 PyObject *result;
5788
5789 self = PyUnicode_FromObject(obj);
5790 if (self == NULL)
5791 return NULL;
5792 str1 = PyUnicode_FromObject(subobj);
5793 if (str1 == NULL) {
5794 Py_DECREF(self);
5795 return NULL;
5796 }
5797 str2 = PyUnicode_FromObject(replobj);
5798 if (str2 == NULL) {
5799 Py_DECREF(self);
5800 Py_DECREF(str1);
5801 return NULL;
5802 }
Tim Petersced69f82003-09-16 20:30:58 +00005803 result = replace((PyUnicodeObject *)self,
5804 (PyUnicodeObject *)str1,
5805 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 maxcount);
5807 Py_DECREF(self);
5808 Py_DECREF(str1);
5809 Py_DECREF(str2);
5810 return result;
5811}
5812
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005813PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814"S.replace (old, new[, maxsplit]) -> unicode\n\
5815\n\
5816Return a copy of S with all occurrences of substring\n\
5817old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005818given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819
5820static PyObject*
5821unicode_replace(PyUnicodeObject *self, PyObject *args)
5822{
5823 PyUnicodeObject *str1;
5824 PyUnicodeObject *str2;
5825 int maxcount = -1;
5826 PyObject *result;
5827
5828 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5829 return NULL;
5830 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5831 if (str1 == NULL)
5832 return NULL;
5833 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005834 if (str2 == NULL) {
5835 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838
5839 result = replace(self, str1, str2, maxcount);
5840
5841 Py_DECREF(str1);
5842 Py_DECREF(str2);
5843 return result;
5844}
5845
5846static
5847PyObject *unicode_repr(PyObject *unicode)
5848{
5849 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5850 PyUnicode_GET_SIZE(unicode),
5851 1);
5852}
5853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005854PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855"S.rfind(sub [,start [,end]]) -> int\n\
5856\n\
5857Return the highest index in S where substring sub is found,\n\
5858such that sub is contained within s[start,end]. Optional\n\
5859arguments start and end are interpreted as in slice notation.\n\
5860\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005861Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862
5863static PyObject *
5864unicode_rfind(PyUnicodeObject *self, PyObject *args)
5865{
5866 PyUnicodeObject *substring;
5867 int start = 0;
5868 int end = INT_MAX;
5869 PyObject *result;
5870
Guido van Rossumb8872e62000-05-09 14:14:27 +00005871 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5872 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 return NULL;
5874 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5875 (PyObject *)substring);
5876 if (substring == NULL)
5877 return NULL;
5878
5879 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5880
5881 Py_DECREF(substring);
5882 return result;
5883}
5884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005885PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886"S.rindex(sub [,start [,end]]) -> int\n\
5887\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005888Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889
5890static PyObject *
5891unicode_rindex(PyUnicodeObject *self, PyObject *args)
5892{
5893 int result;
5894 PyUnicodeObject *substring;
5895 int start = 0;
5896 int end = INT_MAX;
5897
Guido van Rossumb8872e62000-05-09 14:14:27 +00005898 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5899 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 return NULL;
5901 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5902 (PyObject *)substring);
5903 if (substring == NULL)
5904 return NULL;
5905
5906 result = findstring(self, substring, start, end, -1);
5907
5908 Py_DECREF(substring);
5909 if (result < 0) {
5910 PyErr_SetString(PyExc_ValueError, "substring not found");
5911 return NULL;
5912 }
5913 return PyInt_FromLong(result);
5914}
5915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005916PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005917"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918\n\
5919Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005920done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921
5922static PyObject *
5923unicode_rjust(PyUnicodeObject *self, PyObject *args)
5924{
5925 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005926 Py_UNICODE fillchar = ' ';
5927
5928 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 return NULL;
5930
Tim Peters7a29bd52001-09-12 03:03:31 +00005931 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 Py_INCREF(self);
5933 return (PyObject*) self;
5934 }
5935
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005936 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937}
5938
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939static PyObject*
5940unicode_slice(PyUnicodeObject *self, int start, int end)
5941{
5942 /* standard clamping */
5943 if (start < 0)
5944 start = 0;
5945 if (end < 0)
5946 end = 0;
5947 if (end > self->length)
5948 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005949 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 /* full slice, return original string */
5951 Py_INCREF(self);
5952 return (PyObject*) self;
5953 }
5954 if (start > end)
5955 start = end;
5956 /* copy slice */
5957 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5958 end - start);
5959}
5960
5961PyObject *PyUnicode_Split(PyObject *s,
5962 PyObject *sep,
5963 int maxsplit)
5964{
5965 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005966
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 s = PyUnicode_FromObject(s);
5968 if (s == NULL)
5969 return NULL;
5970 if (sep != NULL) {
5971 sep = PyUnicode_FromObject(sep);
5972 if (sep == NULL) {
5973 Py_DECREF(s);
5974 return NULL;
5975 }
5976 }
5977
5978 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5979
5980 Py_DECREF(s);
5981 Py_XDECREF(sep);
5982 return result;
5983}
5984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005985PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986"S.split([sep [,maxsplit]]) -> list of strings\n\
5987\n\
5988Return a list of the words in S, using sep as the\n\
5989delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00005990splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00005991any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992
5993static PyObject*
5994unicode_split(PyUnicodeObject *self, PyObject *args)
5995{
5996 PyObject *substring = Py_None;
5997 int maxcount = -1;
5998
5999 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
6000 return NULL;
6001
6002 if (substring == Py_None)
6003 return split(self, NULL, maxcount);
6004 else if (PyUnicode_Check(substring))
6005 return split(self, (PyUnicodeObject *)substring, maxcount);
6006 else
6007 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6008}
6009
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006010PyObject *PyUnicode_RSplit(PyObject *s,
6011 PyObject *sep,
6012 int maxsplit)
6013{
6014 PyObject *result;
6015
6016 s = PyUnicode_FromObject(s);
6017 if (s == NULL)
6018 return NULL;
6019 if (sep != NULL) {
6020 sep = PyUnicode_FromObject(sep);
6021 if (sep == NULL) {
6022 Py_DECREF(s);
6023 return NULL;
6024 }
6025 }
6026
6027 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6028
6029 Py_DECREF(s);
6030 Py_XDECREF(sep);
6031 return result;
6032}
6033
6034PyDoc_STRVAR(rsplit__doc__,
6035"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6036\n\
6037Return a list of the words in S, using sep as the\n\
6038delimiter string, starting at the end of the string and\n\
6039working to the front. If maxsplit is given, at most maxsplit\n\
6040splits are done. If sep is not specified, any whitespace string\n\
6041is a separator.");
6042
6043static PyObject*
6044unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6045{
6046 PyObject *substring = Py_None;
6047 int maxcount = -1;
6048
6049 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6050 return NULL;
6051
6052 if (substring == Py_None)
6053 return rsplit(self, NULL, maxcount);
6054 else if (PyUnicode_Check(substring))
6055 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6056 else
6057 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6058}
6059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006060PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006061"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062\n\
6063Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006064Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006065is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066
6067static PyObject*
6068unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6069{
Guido van Rossum86662912000-04-11 15:38:46 +00006070 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071
Guido van Rossum86662912000-04-11 15:38:46 +00006072 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 return NULL;
6074
Guido van Rossum86662912000-04-11 15:38:46 +00006075 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076}
6077
6078static
6079PyObject *unicode_str(PyUnicodeObject *self)
6080{
Fred Drakee4315f52000-05-09 19:53:39 +00006081 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082}
6083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006084PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085"S.swapcase() -> unicode\n\
6086\n\
6087Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006088and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089
6090static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006091unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 return fixup(self, fixswapcase);
6094}
6095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006096PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097"S.translate(table) -> unicode\n\
6098\n\
6099Return a copy of the string S, where all characters have been mapped\n\
6100through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006101Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6102Unmapped characters are left untouched. Characters mapped to None\n\
6103are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104
6105static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006106unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107{
Tim Petersced69f82003-09-16 20:30:58 +00006108 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006110 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 "ignore");
6112}
6113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006114PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115"S.upper() -> unicode\n\
6116\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006117Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118
6119static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006120unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 return fixup(self, fixupper);
6123}
6124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006125PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126"S.zfill(width) -> unicode\n\
6127\n\
6128Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006129of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130
6131static PyObject *
6132unicode_zfill(PyUnicodeObject *self, PyObject *args)
6133{
6134 int fill;
6135 PyUnicodeObject *u;
6136
6137 int width;
6138 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6139 return NULL;
6140
6141 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006142 if (PyUnicode_CheckExact(self)) {
6143 Py_INCREF(self);
6144 return (PyObject*) self;
6145 }
6146 else
6147 return PyUnicode_FromUnicode(
6148 PyUnicode_AS_UNICODE(self),
6149 PyUnicode_GET_SIZE(self)
6150 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 }
6152
6153 fill = width - self->length;
6154
6155 u = pad(self, fill, 0, '0');
6156
Walter Dörwald068325e2002-04-15 13:36:47 +00006157 if (u == NULL)
6158 return NULL;
6159
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 if (u->str[fill] == '+' || u->str[fill] == '-') {
6161 /* move sign to beginning of string */
6162 u->str[0] = u->str[fill];
6163 u->str[fill] = '0';
6164 }
6165
6166 return (PyObject*) u;
6167}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168
#if 0
/* Compiled out.  Returns the value of unicode_freelist_size as a Python
   int; the matching method-table entry is likewise under #if 0. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006177PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006178"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006180Return True if S starts with the specified prefix, False otherwise.\n\
6181With optional start, test S beginning at that position.\n\
6182With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183
6184static PyObject *
6185unicode_startswith(PyUnicodeObject *self,
6186 PyObject *args)
6187{
6188 PyUnicodeObject *substring;
6189 int start = 0;
6190 int end = INT_MAX;
6191 PyObject *result;
6192
Guido van Rossumb8872e62000-05-09 14:14:27 +00006193 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6194 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 return NULL;
6196 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6197 (PyObject *)substring);
6198 if (substring == NULL)
6199 return NULL;
6200
Guido van Rossum77f6a652002-04-03 22:41:51 +00006201 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202
6203 Py_DECREF(substring);
6204 return result;
6205}
6206
6207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006208PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006209"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006211Return True if S ends with the specified suffix, False otherwise.\n\
6212With optional start, test S beginning at that position.\n\
6213With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214
6215static PyObject *
6216unicode_endswith(PyUnicodeObject *self,
6217 PyObject *args)
6218{
6219 PyUnicodeObject *substring;
6220 int start = 0;
6221 int end = INT_MAX;
6222 PyObject *result;
6223
Guido van Rossumb8872e62000-05-09 14:14:27 +00006224 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6225 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 return NULL;
6227 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6228 (PyObject *)substring);
6229 if (substring == NULL)
6230 return NULL;
6231
Guido van Rossum77f6a652002-04-03 22:41:51 +00006232 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233
6234 Py_DECREF(substring);
6235 return result;
6236}
6237
6238
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006239
6240static PyObject *
6241unicode_getnewargs(PyUnicodeObject *v)
6242{
6243 return Py_BuildValue("(u#)", v->str, v->length);
6244}
6245
6246
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247static PyMethodDef unicode_methods[] = {
6248
6249 /* Order is according to common usage: often used methods should
6250 appear first, since lookup is done sequentially. */
6251
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006252 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6253 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6254 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006255 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006256 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6257 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6258 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6259 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6260 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6261 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6262 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6263 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6264 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6265 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006266 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006267 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006268/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6269 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6270 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6271 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006272 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006273 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006274 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006275 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6276 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6277 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6278 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6279 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6280 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6281 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6282 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6283 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6284 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6285 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6286 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6287 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6288 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006289 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006290#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006291 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292#endif
6293
6294#if 0
6295 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006296 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297#endif
6298
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006299 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 {NULL, NULL}
6301};
6302
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006303static PyObject *
6304unicode_mod(PyObject *v, PyObject *w)
6305{
6306 if (!PyUnicode_Check(v)) {
6307 Py_INCREF(Py_NotImplemented);
6308 return Py_NotImplemented;
6309 }
6310 return PyUnicode_Format(v, w);
6311}
6312
6313static PyNumberMethods unicode_as_number = {
6314 0, /*nb_add*/
6315 0, /*nb_subtract*/
6316 0, /*nb_multiply*/
6317 0, /*nb_divide*/
6318 unicode_mod, /*nb_remainder*/
6319};
6320
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321static PySequenceMethods unicode_as_sequence = {
6322 (inquiry) unicode_length, /* sq_length */
6323 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6324 (intargfunc) unicode_repeat, /* sq_repeat */
6325 (intargfunc) unicode_getitem, /* sq_item */
6326 (intintargfunc) unicode_slice, /* sq_slice */
6327 0, /* sq_ass_item */
6328 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006329 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330};
6331
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006332static PyObject*
6333unicode_subscript(PyUnicodeObject* self, PyObject* item)
6334{
6335 if (PyInt_Check(item)) {
6336 long i = PyInt_AS_LONG(item);
6337 if (i < 0)
6338 i += PyString_GET_SIZE(self);
6339 return unicode_getitem(self, i);
6340 } else if (PyLong_Check(item)) {
6341 long i = PyLong_AsLong(item);
6342 if (i == -1 && PyErr_Occurred())
6343 return NULL;
6344 if (i < 0)
6345 i += PyString_GET_SIZE(self);
6346 return unicode_getitem(self, i);
6347 } else if (PySlice_Check(item)) {
6348 int start, stop, step, slicelength, cur, i;
6349 Py_UNICODE* source_buf;
6350 Py_UNICODE* result_buf;
6351 PyObject* result;
6352
6353 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6354 &start, &stop, &step, &slicelength) < 0) {
6355 return NULL;
6356 }
6357
6358 if (slicelength <= 0) {
6359 return PyUnicode_FromUnicode(NULL, 0);
6360 } else {
6361 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6362 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6363
6364 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6365 result_buf[i] = source_buf[cur];
6366 }
Tim Petersced69f82003-09-16 20:30:58 +00006367
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006368 result = PyUnicode_FromUnicode(result_buf, slicelength);
6369 PyMem_FREE(result_buf);
6370 return result;
6371 }
6372 } else {
6373 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6374 return NULL;
6375 }
6376}
6377
6378static PyMappingMethods unicode_as_mapping = {
6379 (inquiry)unicode_length, /* mp_length */
6380 (binaryfunc)unicode_subscript, /* mp_subscript */
6381 (objobjargproc)0, /* mp_ass_subscript */
6382};
6383
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384static int
6385unicode_buffer_getreadbuf(PyUnicodeObject *self,
6386 int index,
6387 const void **ptr)
6388{
6389 if (index != 0) {
6390 PyErr_SetString(PyExc_SystemError,
6391 "accessing non-existent unicode segment");
6392 return -1;
6393 }
6394 *ptr = (void *) self->str;
6395 return PyUnicode_GET_DATA_SIZE(self);
6396}
6397
6398static int
6399unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6400 const void **ptr)
6401{
6402 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006403 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 return -1;
6405}
6406
6407static int
6408unicode_buffer_getsegcount(PyUnicodeObject *self,
6409 int *lenp)
6410{
6411 if (lenp)
6412 *lenp = PyUnicode_GET_DATA_SIZE(self);
6413 return 1;
6414}
6415
6416static int
6417unicode_buffer_getcharbuf(PyUnicodeObject *self,
6418 int index,
6419 const void **ptr)
6420{
6421 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006422
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 if (index != 0) {
6424 PyErr_SetString(PyExc_SystemError,
6425 "accessing non-existent unicode segment");
6426 return -1;
6427 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006428 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 if (str == NULL)
6430 return -1;
6431 *ptr = (void *) PyString_AS_STRING(str);
6432 return PyString_GET_SIZE(str);
6433}
6434
6435/* Helpers for PyUnicode_Format() */
6436
6437static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006438getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439{
6440 int argidx = *p_argidx;
6441 if (argidx < arglen) {
6442 (*p_argidx)++;
6443 if (arglen < 0)
6444 return args;
6445 else
6446 return PyTuple_GetItem(args, argidx);
6447 }
6448 PyErr_SetString(PyExc_TypeError,
6449 "not enough arguments for format string");
6450 return NULL;
6451}
6452
/* Bit flags combined in the `flags` argument of the format helpers.
   F_ALT selects the "#" alternate form (see formatfloat()); the other
   names presumably mirror the '-', '+', ' ' and '0' conversion flags --
   confirm against PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
6458
6459static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461{
6462 register int i;
6463 int len;
6464 va_list va;
6465 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467
6468 /* First, format the string as char array, then expand to Py_UNICODE
6469 array. */
6470 charbuffer = (char *)buffer;
6471 len = vsprintf(charbuffer, format, va);
6472 for (i = len - 1; i >= 0; i--)
6473 buffer[i] = (Py_UNICODE) charbuffer[i];
6474
6475 va_end(va);
6476 return len;
6477}
6478
Guido van Rossum078151d2002-08-11 04:24:12 +00006479/* XXX To save some code duplication, formatfloat/long/int could have been
6480 shared with stringobject.c, converting from 8-bit to Unicode after the
6481 formatting is done. */
6482
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483static int
6484formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006485 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 int flags,
6487 int prec,
6488 int type,
6489 PyObject *v)
6490{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006491 /* fmt = '%#.' + `prec` + `type`
6492 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 char fmt[20];
6494 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006495
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 x = PyFloat_AsDouble(v);
6497 if (x == -1.0 && PyErr_Occurred())
6498 return -1;
6499 if (prec < 0)
6500 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6502 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006503 /* Worst case length calc to ensure no buffer overrun:
6504
6505 'g' formats:
6506 fmt = %#.<prec>g
6507 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6508 for any double rep.)
6509 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6510
6511 'f' formats:
6512 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6513 len = 1 + 50 + 1 + prec = 52 + prec
6514
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006515 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006516 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006517
6518 */
6519 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6520 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006521 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006522 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006523 return -1;
6524 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006525 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6526 (flags&F_ALT) ? "#" : "",
6527 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 return usprintf(buf, fmt, x);
6529}
6530
Tim Peters38fd5b62000-09-21 05:43:11 +00006531static PyObject*
6532formatlong(PyObject *val, int flags, int prec, int type)
6533{
6534 char *buf;
6535 int i, len;
6536 PyObject *str; /* temporary string object. */
6537 PyUnicodeObject *result;
6538
6539 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6540 if (!str)
6541 return NULL;
6542 result = _PyUnicode_New(len);
6543 for (i = 0; i < len; i++)
6544 result->str[i] = buf[i];
6545 result->str[len] = 0;
6546 Py_DECREF(str);
6547 return (PyObject*)result;
6548}
6549
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550static int
6551formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006552 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 int flags,
6554 int prec,
6555 int type,
6556 PyObject *v)
6557{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006558 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006559 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6560 * + 1 + 1
6561 * = 24
6562 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006563 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006564 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 long x;
6566
6567 x = PyInt_AsLong(v);
6568 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006569 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006570 if (x < 0 && type == 'u') {
6571 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006572 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006573 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6574 sign = "-";
6575 else
6576 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006578 prec = 1;
6579
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006580 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6581 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006582 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006583 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006584 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006585 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006586 return -1;
6587 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006588
6589 if ((flags & F_ALT) &&
6590 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006591 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006592 * of issues that cause pain:
6593 * - when 0 is being converted, the C standard leaves off
6594 * the '0x' or '0X', which is inconsistent with other
6595 * %#x/%#X conversions and inconsistent with Python's
6596 * hex() function
6597 * - there are platforms that violate the standard and
6598 * convert 0 with the '0x' or '0X'
6599 * (Metrowerks, Compaq Tru64)
6600 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006601 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006602 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006603 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006604 * We can achieve the desired consistency by inserting our
6605 * own '0x' or '0X' prefix, and substituting %x/%X in place
6606 * of %#x/%#X.
6607 *
6608 * Note that this is the same approach as used in
6609 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006610 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006611 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6612 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006613 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006614 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006615 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6616 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006617 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006618 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006619 if (sign[0])
6620 return usprintf(buf, fmt, -x);
6621 else
6622 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623}
6624
6625static int
6626formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006627 size_t buflen,
6628 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006630 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006631 if (PyUnicode_Check(v)) {
6632 if (PyUnicode_GET_SIZE(v) != 1)
6633 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006637 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006638 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006639 goto onError;
6640 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642
6643 else {
6644 /* Integer input truncated to a character */
6645 long x;
6646 x = PyInt_AsLong(v);
6647 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006648 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006649#ifdef Py_UNICODE_WIDE
6650 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006651 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006652 "%c arg not in range(0x110000) "
6653 "(wide Python build)");
6654 return -1;
6655 }
6656#else
6657 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006658 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006659 "%c arg not in range(0x10000) "
6660 "(narrow Python build)");
6661 return -1;
6662 }
6663#endif
6664 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 }
6666 buf[1] = '\0';
6667 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006668
6669 onError:
6670 PyErr_SetString(PyExc_TypeError,
6671 "%c requires int or char");
6672 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673}
6674
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006675/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6676
6677 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6678 chars are formatted. XXX This is a magic number. Each formatting
6679 routine does bounds checking to ensure no overflow, but a better
6680 solution may be to malloc a buffer of appropriate size for each
6681 format. For now, the current solution is sufficient.
6682*/
6683#define FORMATBUFLEN (size_t)120
6684
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685PyObject *PyUnicode_Format(PyObject *format,
6686 PyObject *args)
6687{
6688 Py_UNICODE *fmt, *res;
6689 int fmtcnt, rescnt, reslen, arglen, argidx;
6690 int args_owned = 0;
6691 PyUnicodeObject *result = NULL;
6692 PyObject *dict = NULL;
6693 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006694
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 if (format == NULL || args == NULL) {
6696 PyErr_BadInternalCall();
6697 return NULL;
6698 }
6699 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006700 if (uformat == NULL)
6701 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 fmt = PyUnicode_AS_UNICODE(uformat);
6703 fmtcnt = PyUnicode_GET_SIZE(uformat);
6704
6705 reslen = rescnt = fmtcnt + 100;
6706 result = _PyUnicode_New(reslen);
6707 if (result == NULL)
6708 goto onError;
6709 res = PyUnicode_AS_UNICODE(result);
6710
6711 if (PyTuple_Check(args)) {
6712 arglen = PyTuple_Size(args);
6713 argidx = 0;
6714 }
6715 else {
6716 arglen = -1;
6717 argidx = -2;
6718 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006719 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6720 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 dict = args;
6722
6723 while (--fmtcnt >= 0) {
6724 if (*fmt != '%') {
6725 if (--rescnt < 0) {
6726 rescnt = fmtcnt + 100;
6727 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006728 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 return NULL;
6730 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6731 --rescnt;
6732 }
6733 *res++ = *fmt++;
6734 }
6735 else {
6736 /* Got a format specifier */
6737 int flags = 0;
6738 int width = -1;
6739 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 Py_UNICODE c = '\0';
6741 Py_UNICODE fill;
6742 PyObject *v = NULL;
6743 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006744 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 Py_UNICODE sign;
6746 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006747 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748
6749 fmt++;
6750 if (*fmt == '(') {
6751 Py_UNICODE *keystart;
6752 int keylen;
6753 PyObject *key;
6754 int pcount = 1;
6755
6756 if (dict == NULL) {
6757 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006758 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 goto onError;
6760 }
6761 ++fmt;
6762 --fmtcnt;
6763 keystart = fmt;
6764 /* Skip over balanced parentheses */
6765 while (pcount > 0 && --fmtcnt >= 0) {
6766 if (*fmt == ')')
6767 --pcount;
6768 else if (*fmt == '(')
6769 ++pcount;
6770 fmt++;
6771 }
6772 keylen = fmt - keystart - 1;
6773 if (fmtcnt < 0 || pcount > 0) {
6774 PyErr_SetString(PyExc_ValueError,
6775 "incomplete format key");
6776 goto onError;
6777 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006778#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006779 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 then looked up since Python uses strings to hold
6781 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006782 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 key = PyUnicode_EncodeUTF8(keystart,
6784 keylen,
6785 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006786#else
6787 key = PyUnicode_FromUnicode(keystart, keylen);
6788#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 if (key == NULL)
6790 goto onError;
6791 if (args_owned) {
6792 Py_DECREF(args);
6793 args_owned = 0;
6794 }
6795 args = PyObject_GetItem(dict, key);
6796 Py_DECREF(key);
6797 if (args == NULL) {
6798 goto onError;
6799 }
6800 args_owned = 1;
6801 arglen = -1;
6802 argidx = -2;
6803 }
6804 while (--fmtcnt >= 0) {
6805 switch (c = *fmt++) {
6806 case '-': flags |= F_LJUST; continue;
6807 case '+': flags |= F_SIGN; continue;
6808 case ' ': flags |= F_BLANK; continue;
6809 case '#': flags |= F_ALT; continue;
6810 case '0': flags |= F_ZERO; continue;
6811 }
6812 break;
6813 }
6814 if (c == '*') {
6815 v = getnextarg(args, arglen, &argidx);
6816 if (v == NULL)
6817 goto onError;
6818 if (!PyInt_Check(v)) {
6819 PyErr_SetString(PyExc_TypeError,
6820 "* wants int");
6821 goto onError;
6822 }
6823 width = PyInt_AsLong(v);
6824 if (width < 0) {
6825 flags |= F_LJUST;
6826 width = -width;
6827 }
6828 if (--fmtcnt >= 0)
6829 c = *fmt++;
6830 }
6831 else if (c >= '0' && c <= '9') {
6832 width = c - '0';
6833 while (--fmtcnt >= 0) {
6834 c = *fmt++;
6835 if (c < '0' || c > '9')
6836 break;
6837 if ((width*10) / 10 != width) {
6838 PyErr_SetString(PyExc_ValueError,
6839 "width too big");
6840 goto onError;
6841 }
6842 width = width*10 + (c - '0');
6843 }
6844 }
6845 if (c == '.') {
6846 prec = 0;
6847 if (--fmtcnt >= 0)
6848 c = *fmt++;
6849 if (c == '*') {
6850 v = getnextarg(args, arglen, &argidx);
6851 if (v == NULL)
6852 goto onError;
6853 if (!PyInt_Check(v)) {
6854 PyErr_SetString(PyExc_TypeError,
6855 "* wants int");
6856 goto onError;
6857 }
6858 prec = PyInt_AsLong(v);
6859 if (prec < 0)
6860 prec = 0;
6861 if (--fmtcnt >= 0)
6862 c = *fmt++;
6863 }
6864 else if (c >= '0' && c <= '9') {
6865 prec = c - '0';
6866 while (--fmtcnt >= 0) {
6867 c = Py_CHARMASK(*fmt++);
6868 if (c < '0' || c > '9')
6869 break;
6870 if ((prec*10) / 10 != prec) {
6871 PyErr_SetString(PyExc_ValueError,
6872 "prec too big");
6873 goto onError;
6874 }
6875 prec = prec*10 + (c - '0');
6876 }
6877 }
6878 } /* prec */
6879 if (fmtcnt >= 0) {
6880 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 if (--fmtcnt >= 0)
6882 c = *fmt++;
6883 }
6884 }
6885 if (fmtcnt < 0) {
6886 PyErr_SetString(PyExc_ValueError,
6887 "incomplete format");
6888 goto onError;
6889 }
6890 if (c != '%') {
6891 v = getnextarg(args, arglen, &argidx);
6892 if (v == NULL)
6893 goto onError;
6894 }
6895 sign = 0;
6896 fill = ' ';
6897 switch (c) {
6898
6899 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006900 pbuf = formatbuf;
6901 /* presume that buffer length is at least 1 */
6902 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 len = 1;
6904 break;
6905
6906 case 's':
6907 case 'r':
6908 if (PyUnicode_Check(v) && c == 's') {
6909 temp = v;
6910 Py_INCREF(temp);
6911 }
6912 else {
6913 PyObject *unicode;
6914 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006915 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 else
6917 temp = PyObject_Repr(v);
6918 if (temp == NULL)
6919 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006920 if (PyUnicode_Check(temp))
6921 /* nothing to do */;
6922 else if (PyString_Check(temp)) {
6923 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00006924 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006926 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 "strict");
6928 Py_DECREF(temp);
6929 temp = unicode;
6930 if (temp == NULL)
6931 goto onError;
6932 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006933 else {
6934 Py_DECREF(temp);
6935 PyErr_SetString(PyExc_TypeError,
6936 "%s argument has non-string str()");
6937 goto onError;
6938 }
6939 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006940 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 len = PyUnicode_GET_SIZE(temp);
6942 if (prec >= 0 && len > prec)
6943 len = prec;
6944 break;
6945
6946 case 'i':
6947 case 'd':
6948 case 'u':
6949 case 'o':
6950 case 'x':
6951 case 'X':
6952 if (c == 'i')
6953 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006954 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006955 temp = formatlong(v, flags, prec, c);
6956 if (!temp)
6957 goto onError;
6958 pbuf = PyUnicode_AS_UNICODE(temp);
6959 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00006960 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006962 else {
6963 pbuf = formatbuf;
6964 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6965 flags, prec, c, v);
6966 if (len < 0)
6967 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006968 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006969 }
6970 if (flags & F_ZERO)
6971 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 break;
6973
6974 case 'e':
6975 case 'E':
6976 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006977 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 case 'g':
6979 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006980 if (c == 'F')
6981 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006982 pbuf = formatbuf;
6983 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6984 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 if (len < 0)
6986 goto onError;
6987 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006988 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 fill = '0';
6990 break;
6991
6992 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006993 pbuf = formatbuf;
6994 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 if (len < 0)
6996 goto onError;
6997 break;
6998
6999 default:
7000 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007001 "unsupported format character '%c' (0x%x) "
7002 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007003 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007004 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007005 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006 goto onError;
7007 }
7008 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007009 if (*pbuf == '-' || *pbuf == '+') {
7010 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 len--;
7012 }
7013 else if (flags & F_SIGN)
7014 sign = '+';
7015 else if (flags & F_BLANK)
7016 sign = ' ';
7017 else
7018 sign = 0;
7019 }
7020 if (width < len)
7021 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007022 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023 reslen -= rescnt;
7024 rescnt = width + fmtcnt + 100;
7025 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007026 if (reslen < 0) {
7027 Py_DECREF(result);
7028 return PyErr_NoMemory();
7029 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007030 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 return NULL;
7032 res = PyUnicode_AS_UNICODE(result)
7033 + reslen - rescnt;
7034 }
7035 if (sign) {
7036 if (fill != ' ')
7037 *res++ = sign;
7038 rescnt--;
7039 if (width > len)
7040 width--;
7041 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007042 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7043 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007044 assert(pbuf[1] == c);
7045 if (fill != ' ') {
7046 *res++ = *pbuf++;
7047 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007048 }
Tim Petersfff53252001-04-12 18:38:48 +00007049 rescnt -= 2;
7050 width -= 2;
7051 if (width < 0)
7052 width = 0;
7053 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007054 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055 if (width > len && !(flags & F_LJUST)) {
7056 do {
7057 --rescnt;
7058 *res++ = fill;
7059 } while (--width > len);
7060 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007061 if (fill == ' ') {
7062 if (sign)
7063 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007064 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007065 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007066 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007067 *res++ = *pbuf++;
7068 *res++ = *pbuf++;
7069 }
7070 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007071 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 res += len;
7073 rescnt -= len;
7074 while (--width >= len) {
7075 --rescnt;
7076 *res++ = ' ';
7077 }
7078 if (dict && (argidx < arglen) && c != '%') {
7079 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007080 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081 goto onError;
7082 }
7083 Py_XDECREF(temp);
7084 } /* '%' */
7085 } /* until end */
7086 if (argidx < arglen && !dict) {
7087 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007088 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089 goto onError;
7090 }
7091
7092 if (args_owned) {
7093 Py_DECREF(args);
7094 }
7095 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007096 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007097 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098 return (PyObject *)result;
7099
7100 onError:
7101 Py_XDECREF(result);
7102 Py_DECREF(uformat);
7103 if (args_owned) {
7104 Py_DECREF(args);
7105 }
7106 return NULL;
7107}
7108
/* Buffer-interface slot table for unicode objects; installed in the
   tp_as_buffer slot of PyUnicode_Type.  The entries are, in order, the
   read-buffer, write-buffer, segment-count and char-buffer handlers. */
7109static PyBufferProcs unicode_as_buffer = {
7110 (getreadbufferproc) unicode_buffer_getreadbuf,
7111 (getwritebufferproc) unicode_buffer_getwritebuf,
7112 (getsegcountproc) unicode_buffer_getsegcount,
7113 (getcharbufferproc) unicode_buffer_getcharbuf,
7114};
7115
Jeremy Hylton938ace62002-07-17 16:30:39 +00007116static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007117unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7118
Tim Peters6d6c1a32001-08-02 04:15:00 +00007119static PyObject *
7120unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7121{
7122 PyObject *x = NULL;
7123 static char *kwlist[] = {"string", "encoding", "errors", 0};
7124 char *encoding = NULL;
7125 char *errors = NULL;
7126
Guido van Rossume023fe02001-08-30 03:12:59 +00007127 if (type != &PyUnicode_Type)
7128 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007129 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7130 kwlist, &x, &encoding, &errors))
7131 return NULL;
7132 if (x == NULL)
7133 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007134 if (encoding == NULL && errors == NULL)
7135 return PyObject_Unicode(x);
7136 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007137 return PyUnicode_FromEncodedObject(x, encoding, errors);
7138}
7139
Guido van Rossume023fe02001-08-30 03:12:59 +00007140static PyObject *
7141unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7142{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007143 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007144 int n;
7145
7146 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7147 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7148 if (tmp == NULL)
7149 return NULL;
7150 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007151 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007152 if (pnew == NULL) {
7153 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007154 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007155 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007156 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7157 if (pnew->str == NULL) {
7158 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007159 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007160 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007161 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007162 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007163 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7164 pnew->length = n;
7165 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007166 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007167 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007168}
7169
/* Docstring of the unicode type; installed in the tp_doc slot of
   PyUnicode_Type. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007170PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007171"unicode(string [, encoding[, errors]]) -> object\n\
7172\n\
7173Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007174encoding defaults to the current default string encoding.\n\
7175errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007176
/* Type object for the built-in unicode type.  It derives from
   PyBaseString_Type, may itself be subclassed (Py_TPFLAGS_BASETYPE),
   creates instances through unicode_new and frees them with
   PyObject_Del.  Slots set to 0 are left unset here. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177PyTypeObject PyUnicode_Type = {
7178 PyObject_HEAD_INIT(&PyType_Type)
7179 0, /* ob_size */
7180 "unicode", /* tp_name */
7181 sizeof(PyUnicodeObject), /* tp_basicsize */
7182 0, /* tp_itemsize */
7183 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007184 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007186 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 0, /* tp_setattr */
7188 (cmpfunc) unicode_compare, /* tp_compare */
7189 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007190 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007192 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 (hashfunc) unicode_hash, /* tp_hash*/
7194 0, /* tp_call*/
7195 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007196 PyObject_GenericGetAttr, /* tp_getattro */
7197 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007199 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7200 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007201 unicode_doc, /* tp_doc */
7202 0, /* tp_traverse */
7203 0, /* tp_clear */
7204 0, /* tp_richcompare */
7205 0, /* tp_weaklistoffset */
7206 0, /* tp_iter */
7207 0, /* tp_iternext */
7208 unicode_methods, /* tp_methods */
7209 0, /* tp_members */
7210 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007211 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007212 0, /* tp_dict */
7213 0, /* tp_descr_get */
7214 0, /* tp_descr_set */
7215 0, /* tp_dictoffset */
7216 0, /* tp_init */
7217 0, /* tp_alloc */
7218 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007219 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220};
7221
7222/* Initialize the Unicode implementation */
7223
Thomas Wouters78890102000-07-22 19:25:51 +00007224void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007226 int i;
7227
Fred Drakee4315f52000-05-09 19:53:39 +00007228 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007229 unicode_freelist = NULL;
7230 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007232 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007233 for (i = 0; i < 256; i++)
7234 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007235 if (PyType_Ready(&PyUnicode_Type) < 0)
7236 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237}
7238
7239/* Finalize the Unicode implementation */
7240
7241void
Thomas Wouters78890102000-07-22 19:25:51 +00007242_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007244 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007245 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007247 Py_XDECREF(unicode_empty);
7248 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007249
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007250 for (i = 0; i < 256; i++) {
7251 if (unicode_latin1[i]) {
7252 Py_DECREF(unicode_latin1[i]);
7253 unicode_latin1[i] = NULL;
7254 }
7255 }
7256
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007257 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258 PyUnicodeObject *v = u;
7259 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007260 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007261 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007262 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007263 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007265 unicode_freelist = NULL;
7266 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007268
7269/*
7270Local variables:
7271c-basic-offset: 4
7272indent-tabs-mode: nil
7273End:
7274*/