blob: 83104d802f31cb8dd53da5b355c985c82aea2f2b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Keep-alive threshold for objects parked on the free list.

   When an object is put on the free list, its character buffer is
   left allocated as long as the object's length is below this limit;
   longer buffers are released right away.  Reusing the small buffers
   saves malloc() calls for short Unicode objects.

   Worst case, MAX_UNICODE_FREELIST_SIZE * (sizeof(PyUnicodeObject) +
   KEEPALIVE_SIZE_LIMIT + malloc()-overhead) bytes sit around unused.

   A limit of 0 switches the optimization off.

   Note: the feature is experimental.  If Unicode objects cause core
   dumps, try disabling it.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is set. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
/* Internal API for use in unicodeobject.c only !

   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a PyUnicodeObject* variable to PyObject**.  Both arguments are
   parenthesized so that arbitrary expressions can be passed safely. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
371 for (i = size; i >= 0; i--)
372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
387 if (size > PyUnicode_GET_SIZE(unicode))
388 size = PyUnicode_GET_SIZE(unicode);
389#ifdef HAVE_USABLE_WCHAR_T
390 memcpy(w, unicode->str, size * sizeof(wchar_t));
391#else
392 {
393 register Py_UNICODE *u;
394 register int i;
395 u = PyUnicode_AS_UNICODE(unicode);
396 for (i = size; i >= 0; i--)
397 *w++ = *u++;
398 }
399#endif
400
401 return size;
402}
403
404#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
408 Py_UNICODE s[2];
409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
426 if (ordinal <= 0xffff) {
427 /* UCS-2 character */
428 s[0] = (Py_UNICODE) ordinal;
429 return PyUnicode_FromUnicode(s, 1);
430 }
431 else {
432#ifndef Py_UNICODE_WIDE
433 /* UCS-4 character. store as two surrogate characters */
434 ordinal -= 0x10000L;
435 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
436 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
437 return PyUnicode_FromUnicode(s, 2);
438#else
439 s[0] = (Py_UNICODE)ordinal;
440 return PyUnicode_FromUnicode(s, 1);
441#endif
442 }
443}
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445PyObject *PyUnicode_FromObject(register PyObject *obj)
446{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000447 /* XXX Perhaps we should make this API an alias of
448 PyObject_Unicode() instead ?! */
449 if (PyUnicode_CheckExact(obj)) {
450 Py_INCREF(obj);
451 return obj;
452 }
453 if (PyUnicode_Check(obj)) {
454 /* For a Unicode subtype that's not a Unicode object,
455 return a true Unicode object with the same data. */
456 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
457 PyUnicode_GET_SIZE(obj));
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
460}
461
462PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
463 const char *encoding,
464 const char *errors)
465{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000466 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000468 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000469
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 if (obj == NULL) {
471 PyErr_BadInternalCall();
472 return NULL;
473 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000474
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000475#if 0
476 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000477 that no encodings is given and then redirect to
478 PyObject_Unicode() which then applies the additional logic for
479 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481 NOTE: This API should really only be used for object which
482 represent *encoded* Unicode !
483
484 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000485 if (PyUnicode_Check(obj)) {
486 if (encoding) {
487 PyErr_SetString(PyExc_TypeError,
488 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000489 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000492 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000493#else
494 if (PyUnicode_Check(obj)) {
495 PyErr_SetString(PyExc_TypeError,
496 "decoding Unicode is not supported");
497 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000498 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499#endif
500
501 /* Coerce object */
502 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000503 s = PyString_AS_STRING(obj);
504 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000505 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000506 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
507 /* Overwrite the error message with something more useful in
508 case of a TypeError. */
509 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000511 "coercing to Unicode: need string or buffer, "
512 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000513 obj->ob_type->tp_name);
514 goto onError;
515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000518 if (len == 0) {
519 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521 }
Tim Petersced69f82003-09-16 20:30:58 +0000522 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000523 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000524
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000525 return v;
526
527 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529}
530
531PyObject *PyUnicode_Decode(const char *s,
532 int size,
533 const char *encoding,
534 const char *errors)
535{
536 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000537
538 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000539 encoding = PyUnicode_GetDefaultEncoding();
540
541 /* Shortcuts for common default encodings */
542 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000544 else if (strcmp(encoding, "latin-1") == 0)
545 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000546#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
547 else if (strcmp(encoding, "mbcs") == 0)
548 return PyUnicode_DecodeMBCS(s, size, errors);
549#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000550 else if (strcmp(encoding, "ascii") == 0)
551 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552
553 /* Decode via the codec registry */
554 buffer = PyBuffer_FromMemory((void *)s, size);
555 if (buffer == NULL)
556 goto onError;
557 unicode = PyCodec_Decode(buffer, encoding, errors);
558 if (unicode == NULL)
559 goto onError;
560 if (!PyUnicode_Check(unicode)) {
561 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000562 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000563 unicode->ob_type->tp_name);
564 Py_DECREF(unicode);
565 goto onError;
566 }
567 Py_DECREF(buffer);
568 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000569
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 onError:
571 Py_XDECREF(buffer);
572 return NULL;
573}
574
575PyObject *PyUnicode_Encode(const Py_UNICODE *s,
576 int size,
577 const char *encoding,
578 const char *errors)
579{
580 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000581
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 unicode = PyUnicode_FromUnicode(s, size);
583 if (unicode == NULL)
584 return NULL;
585 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
586 Py_DECREF(unicode);
587 return v;
588}
589
590PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
591 const char *encoding,
592 const char *errors)
593{
594 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596 if (!PyUnicode_Check(unicode)) {
597 PyErr_BadArgument();
598 goto onError;
599 }
Fred Drakee4315f52000-05-09 19:53:39 +0000600
Tim Petersced69f82003-09-16 20:30:58 +0000601 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000602 encoding = PyUnicode_GetDefaultEncoding();
603
604 /* Shortcuts for common default encodings */
605 if (errors == NULL) {
606 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000607 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000608 else if (strcmp(encoding, "latin-1") == 0)
609 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000610#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
611 else if (strcmp(encoding, "mbcs") == 0)
612 return PyUnicode_AsMBCSString(unicode);
613#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000614 else if (strcmp(encoding, "ascii") == 0)
615 return PyUnicode_AsASCIIString(unicode);
616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617
618 /* Encode via the codec registry */
619 v = PyCodec_Encode(unicode, encoding, errors);
620 if (v == NULL)
621 goto onError;
622 /* XXX Should we really enforce this ? */
623 if (!PyString_Check(v)) {
624 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000625 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 v->ob_type->tp_name);
627 Py_DECREF(v);
628 goto onError;
629 }
630 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000631
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632 onError:
633 return NULL;
634}
635
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000636PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
637 const char *errors)
638{
639 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
640
641 if (v)
642 return v;
643 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
644 if (v && errors == NULL)
645 ((PyUnicodeObject *)unicode)->defenc = v;
646 return v;
647}
648
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
650{
651 if (!PyUnicode_Check(unicode)) {
652 PyErr_BadArgument();
653 goto onError;
654 }
655 return PyUnicode_AS_UNICODE(unicode);
656
657 onError:
658 return NULL;
659}
660
661int PyUnicode_GetSize(PyObject *unicode)
662{
663 if (!PyUnicode_Check(unicode)) {
664 PyErr_BadArgument();
665 goto onError;
666 }
667 return PyUnicode_GET_SIZE(unicode);
668
669 onError:
670 return -1;
671}
672
Thomas Wouters78890102000-07-22 19:25:51 +0000673const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000674{
675 return unicode_default_encoding;
676}
677
678int PyUnicode_SetDefaultEncoding(const char *encoding)
679{
680 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000681
Fred Drakee4315f52000-05-09 19:53:39 +0000682 /* Make sure the encoding is valid. As side effect, this also
683 loads the encoding into the codec registry cache. */
684 v = _PyCodec_Lookup(encoding);
685 if (v == NULL)
686 goto onError;
687 Py_DECREF(v);
688 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000689 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000690 sizeof(unicode_default_encoding));
691 return 0;
692
693 onError:
694 return -1;
695}
696
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000697/* error handling callback helper:
698 build arguments, call the callback and check the arguments,
699 if no exception occured, copy the replacement to the output
700 and adjust various state variables.
701 return 0 on success, -1 on error
702*/
703
704static
705int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
706 const char *encoding, const char *reason,
707 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
708 PyObject **output, int *outpos, Py_UNICODE **outptr)
709{
710 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
711
712 PyObject *restuple = NULL;
713 PyObject *repunicode = NULL;
714 int outsize = PyUnicode_GET_SIZE(*output);
715 int requiredsize;
716 int newpos;
717 Py_UNICODE *repptr;
718 int repsize;
719 int res = -1;
720
721 if (*errorHandler == NULL) {
722 *errorHandler = PyCodec_LookupError(errors);
723 if (*errorHandler == NULL)
724 goto onError;
725 }
726
727 if (*exceptionObject == NULL) {
728 *exceptionObject = PyUnicodeDecodeError_Create(
729 encoding, input, insize, *startinpos, *endinpos, reason);
730 if (*exceptionObject == NULL)
731 goto onError;
732 }
733 else {
734 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
735 goto onError;
736 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
737 goto onError;
738 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
739 goto onError;
740 }
741
742 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
743 if (restuple == NULL)
744 goto onError;
745 if (!PyTuple_Check(restuple)) {
746 PyErr_Format(PyExc_TypeError, &argparse[4]);
747 goto onError;
748 }
749 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
750 goto onError;
751 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000752 newpos = insize+newpos;
753 if (newpos<0 || newpos>insize) {
754 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
755 goto onError;
756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000757
758 /* need more space? (at least enough for what we
759 have+the replacement+the rest of the string (starting
760 at the new input position), so we won't have to check space
761 when there are no errors in the rest of the string) */
762 repptr = PyUnicode_AS_UNICODE(repunicode);
763 repsize = PyUnicode_GET_SIZE(repunicode);
764 requiredsize = *outpos + repsize + insize-newpos;
765 if (requiredsize > outsize) {
766 if (requiredsize<2*outsize)
767 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000768 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000769 goto onError;
770 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
771 }
772 *endinpos = newpos;
773 *inptr = input + newpos;
774 Py_UNICODE_COPY(*outptr, repptr, repsize);
775 *outptr += repsize;
776 *outpos += repsize;
777 /* we made it! */
778 res = 0;
779
780 onError:
781 Py_XDECREF(restuple);
782 return res;
783}
784
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000785/* --- UTF-7 Codec -------------------------------------------------------- */
786
787/* see RFC2152 for details */
788
/* Classification of the 128 ASCII code points for UTF-7 encoding,
   i.e. whether a character can be written directly or not:

     0 - not special (may be encoded directly)
     1 - special (cannot be encoded directly)
     2 - whitespace (direct encoding optional)
     3 - RFC2152 Set O (direct encoding optional)

   Each row below covers 16 consecutive code points. */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
807
808#define SPECIAL(c, encodeO, encodeWS) \
809 (((c)>127 || utf7_special[(c)] == 1) || \
810 (encodeWS && (utf7_special[(c)] == 2)) || \
811 (encodeO && (utf7_special[(c)] == 3)))
812
/* B64(n): the base64 alphabet character for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is a member of the base64 alphabet.

   This is spelled out as ASCII range tests instead of calling isalnum():
   the callers pass Py_UNICODE values (and bytes read from the input),
   for which isalnum() is undefined when the value does not fit in an
   unsigned char and locale-dependent for 0x80..0xFF.  UB64() below
   already assumes an ASCII execution character set. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): the six-bit value of base64 character c.  Only meaningful
   when B64CHAR(c) is true. */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   write the topmost six pending bits to *out as a base64 character.
   Updates out and bits in place; ch is not modified. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, take the topmost sixteen as one code unit and append
   it to *out.  A code unit in 0xDC00..0xDFFF is rejected: the macro
   sets `surrogate`, stores a message in the caller's `errmsg` variable
   and jumps to the caller's `utf7Error` label.  The code unit that
   follows such an error is dropped silently. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }

Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000843PyObject *PyUnicode_DecodeUTF7(const char *s,
844 int size,
845 const char *errors)
846{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000847 const char *starts = s;
848 int startinpos;
849 int endinpos;
850 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000851 const char *e;
852 PyUnicodeObject *unicode;
853 Py_UNICODE *p;
854 const char *errmsg = "";
855 int inShift = 0;
856 unsigned int bitsleft = 0;
857 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000858 int surrogate = 0;
859 PyObject *errorHandler = NULL;
860 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000861
862 unicode = _PyUnicode_New(size);
863 if (!unicode)
864 return NULL;
865 if (size == 0)
866 return (PyObject *)unicode;
867
868 p = unicode->str;
869 e = s + size;
870
871 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000872 Py_UNICODE ch;
873 restart:
874 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000875
876 if (inShift) {
877 if ((ch == '-') || !B64CHAR(ch)) {
878 inShift = 0;
879 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000880
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000881 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
882 if (bitsleft >= 6) {
883 /* The shift sequence has a partial character in it. If
884 bitsleft < 6 then we could just classify it as padding
885 but that is not the case here */
886
887 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000888 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889 }
890 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000891 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000892 here so indicate the potential of a misencoded character. */
893
894 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
895 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
896 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000897 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000898 }
899
900 if (ch == '-') {
901 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000902 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000903 inShift = 1;
904 }
905 } else if (SPECIAL(ch,0,0)) {
906 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000907 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908 } else {
909 *p++ = ch;
910 }
911 } else {
912 charsleft = (charsleft << 6) | UB64(ch);
913 bitsleft += 6;
914 s++;
915 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
916 }
917 }
918 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000919 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000920 s++;
921 if (s < e && *s == '-') {
922 s++;
923 *p++ = '+';
924 } else
925 {
926 inShift = 1;
927 bitsleft = 0;
928 }
929 }
930 else if (SPECIAL(ch,0,0)) {
931 errmsg = "unexpected special character";
932 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000933 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000934 }
935 else {
936 *p++ = ch;
937 s++;
938 }
939 continue;
940 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000941 outpos = p-PyUnicode_AS_UNICODE(unicode);
942 endinpos = s-starts;
943 if (unicode_decode_call_errorhandler(
944 errors, &errorHandler,
945 "utf7", errmsg,
946 starts, size, &startinpos, &endinpos, &exc, &s,
947 (PyObject **)&unicode, &outpos, &p))
948 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 }
950
951 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000952 outpos = p-PyUnicode_AS_UNICODE(unicode);
953 endinpos = size;
954 if (unicode_decode_call_errorhandler(
955 errors, &errorHandler,
956 "utf7", "unterminated shift sequence",
957 starts, size, &startinpos, &endinpos, &exc, &s,
958 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000959 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000960 if (s < e)
961 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000962 }
963
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000964 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000965 goto onError;
966
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000967 Py_XDECREF(errorHandler);
968 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000969 return (PyObject *)unicode;
970
971onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000972 Py_XDECREF(errorHandler);
973 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974 Py_DECREF(unicode);
975 return NULL;
976}
977
978
979PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
980 int size,
981 int encodeSetO,
982 int encodeWhiteSpace,
983 const char *errors)
984{
985 PyObject *v;
986 /* It might be possible to tighten this worst case */
987 unsigned int cbAllocated = 5 * size;
988 int inShift = 0;
989 int i = 0;
990 unsigned int bitsleft = 0;
991 unsigned long charsleft = 0;
992 char * out;
993 char * start;
994
995 if (size == 0)
996 return PyString_FromStringAndSize(NULL, 0);
997
998 v = PyString_FromStringAndSize(NULL, cbAllocated);
999 if (v == NULL)
1000 return NULL;
1001
1002 start = out = PyString_AS_STRING(v);
1003 for (;i < size; ++i) {
1004 Py_UNICODE ch = s[i];
1005
1006 if (!inShift) {
1007 if (ch == '+') {
1008 *out++ = '+';
1009 *out++ = '-';
1010 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1011 charsleft = ch;
1012 bitsleft = 16;
1013 *out++ = '+';
1014 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1015 inShift = bitsleft > 0;
1016 } else {
1017 *out++ = (char) ch;
1018 }
1019 } else {
1020 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1021 *out++ = B64(charsleft << (6-bitsleft));
1022 charsleft = 0;
1023 bitsleft = 0;
1024 /* Characters not in the BASE64 set implicitly unshift the sequence
1025 so no '-' is required, except if the character is itself a '-' */
1026 if (B64CHAR(ch) || ch == '-') {
1027 *out++ = '-';
1028 }
1029 inShift = 0;
1030 *out++ = (char) ch;
1031 } else {
1032 bitsleft += 16;
1033 charsleft = (charsleft << 16) | ch;
1034 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1035
1036 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001037 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 or '-' then the shift sequence will be terminated implicitly and we
1039 don't have to insert a '-'. */
1040
1041 if (bitsleft == 0) {
1042 if (i + 1 < size) {
1043 Py_UNICODE ch2 = s[i+1];
1044
1045 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001046
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001047 } else if (B64CHAR(ch2) || ch2 == '-') {
1048 *out++ = '-';
1049 inShift = 0;
1050 } else {
1051 inShift = 0;
1052 }
1053
1054 }
1055 else {
1056 *out++ = '-';
1057 inShift = 0;
1058 }
1059 }
Tim Petersced69f82003-09-16 20:30:58 +00001060 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062 }
1063 if (bitsleft) {
1064 *out++= B64(charsleft << (6-bitsleft) );
1065 *out++ = '-';
1066 }
1067
Tim Peters5de98422002-04-27 18:44:32 +00001068 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001069 return v;
1070}
1071
1072#undef SPECIAL
1073#undef B64
1074#undef B64CHAR
1075#undef UB64
1076#undef ENCODE
1077#undef DECODE
1078
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079/* --- UTF-8 Codec -------------------------------------------------------- */
1080
/* Length in bytes of the UTF-8 sequence introduced by each possible
   lead byte; 0 marks a byte that is not a legal prefix.  See RFC 2279
   for details. */
static
char utf8_code_length[256] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1102
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103PyObject *PyUnicode_DecodeUTF8(const char *s,
1104 int size,
1105 const char *errors)
1106{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001107 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001109 int startinpos;
1110 int endinpos;
1111 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 const char *e;
1113 PyUnicodeObject *unicode;
1114 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001115 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001116 PyObject *errorHandler = NULL;
1117 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118
1119 /* Note: size will always be longer than the resulting Unicode
1120 character count */
1121 unicode = _PyUnicode_New(size);
1122 if (!unicode)
1123 return NULL;
1124 if (size == 0)
1125 return (PyObject *)unicode;
1126
1127 /* Unpack UTF-8 encoded data */
1128 p = unicode->str;
1129 e = s + size;
1130
1131 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001132 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133
1134 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001135 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136 s++;
1137 continue;
1138 }
1139
1140 n = utf8_code_length[ch];
1141
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001142 if (s + n > e) {
1143 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001144 startinpos = s-starts;
1145 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001146 goto utf8Error;
1147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148
1149 switch (n) {
1150
1151 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001152 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001153 startinpos = s-starts;
1154 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001155 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156
1157 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001158 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001159 startinpos = s-starts;
1160 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001161 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162
1163 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001164 if ((s[1] & 0xc0) != 0x80) {
1165 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001166 startinpos = s-starts;
1167 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001168 goto utf8Error;
1169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001172 startinpos = s-starts;
1173 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001174 errmsg = "illegal encoding";
1175 goto utf8Error;
1176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 break;
1180
1181 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001182 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001183 (s[2] & 0xc0) != 0x80) {
1184 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001185 startinpos = s-starts;
1186 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001187 goto utf8Error;
1188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001190 if (ch < 0x0800) {
1191 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001192 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001193
1194 XXX For wide builds (UCS-4) we should probably try
1195 to recombine the surrogates into a single code
1196 unit.
1197 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001198 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001199 startinpos = s-starts;
1200 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001201 goto utf8Error;
1202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001204 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001205 break;
1206
1207 case 4:
1208 if ((s[1] & 0xc0) != 0x80 ||
1209 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001210 (s[3] & 0xc0) != 0x80) {
1211 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001212 startinpos = s-starts;
1213 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001214 goto utf8Error;
1215 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001216 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1217 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1218 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001219 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001220 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001221 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001222 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001223 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001225 startinpos = s-starts;
1226 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 goto utf8Error;
1228 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001229#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001230 *p++ = (Py_UNICODE)ch;
1231#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001232 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001233
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001234 /* translate from 10000..10FFFF to 0..FFFF */
1235 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001236
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001237 /* high surrogate = top 10 bits added to D800 */
1238 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001239
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001240 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001241 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001242#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 break;
1244
1245 default:
1246 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001247 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001248 startinpos = s-starts;
1249 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001250 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 }
1252 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001253 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001255 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001256 outpos = p-PyUnicode_AS_UNICODE(unicode);
1257 if (unicode_decode_call_errorhandler(
1258 errors, &errorHandler,
1259 "utf8", errmsg,
1260 starts, size, &startinpos, &endinpos, &exc, &s,
1261 (PyObject **)&unicode, &outpos, &p))
1262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 }
1264
1265 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001266 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267 goto onError;
1268
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001269 Py_XDECREF(errorHandler);
1270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271 return (PyObject *)unicode;
1272
1273onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274 Py_XDECREF(errorHandler);
1275 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 Py_DECREF(unicode);
1277 return NULL;
1278}
1279
Tim Peters602f7402002-04-27 18:03:26 +00001280/* Allocation strategy: if the string is short, convert into a stack buffer
1281 and allocate exactly as much space needed at the end. Else allocate the
1282 maximum possible needed (4 result bytes per Unicode character), and return
1283 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001284*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001285PyObject *
1286PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1287 int size,
1288 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289{
Tim Peters602f7402002-04-27 18:03:26 +00001290#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001291
Tim Peters602f7402002-04-27 18:03:26 +00001292 int i; /* index into s of next input byte */
1293 PyObject *v; /* result string object */
1294 char *p; /* next free byte in output buffer */
1295 int nallocated; /* number of result bytes allocated */
1296 int nneeded; /* number of result bytes needed */
1297 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001298
Tim Peters602f7402002-04-27 18:03:26 +00001299 assert(s != NULL);
1300 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301
Tim Peters602f7402002-04-27 18:03:26 +00001302 if (size <= MAX_SHORT_UNICHARS) {
1303 /* Write into the stack buffer; nallocated can't overflow.
1304 * At the end, we'll allocate exactly as much heap space as it
1305 * turns out we need.
1306 */
1307 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1308 v = NULL; /* will allocate after we're done */
1309 p = stackbuf;
1310 }
1311 else {
1312 /* Overallocate on the heap, and give the excess back at the end. */
1313 nallocated = size * 4;
1314 if (nallocated / 4 != size) /* overflow! */
1315 return PyErr_NoMemory();
1316 v = PyString_FromStringAndSize(NULL, nallocated);
1317 if (v == NULL)
1318 return NULL;
1319 p = PyString_AS_STRING(v);
1320 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001321
Tim Peters602f7402002-04-27 18:03:26 +00001322 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001323 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001324
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001325 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001326 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001328
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001330 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001331 *p++ = (char)(0xc0 | (ch >> 6));
1332 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001333 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001334 else {
Tim Peters602f7402002-04-27 18:03:26 +00001335 /* Encode UCS2 Unicode ordinals */
1336 if (ch < 0x10000) {
1337 /* Special case: check for high surrogate */
1338 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1339 Py_UCS4 ch2 = s[i];
1340 /* Check for low surrogate and combine the two to
1341 form a UCS4 value */
1342 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001343 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001344 i++;
1345 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 }
Tim Peters602f7402002-04-27 18:03:26 +00001347 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001349 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001350 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1351 *p++ = (char)(0x80 | (ch & 0x3f));
1352 continue;
1353 }
1354encodeUCS4:
1355 /* Encode UCS4 Unicode ordinals */
1356 *p++ = (char)(0xf0 | (ch >> 18));
1357 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1358 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1359 *p++ = (char)(0x80 | (ch & 0x3f));
1360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001362
Tim Peters602f7402002-04-27 18:03:26 +00001363 if (v == NULL) {
1364 /* This was stack allocated. */
1365 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1366 assert(nneeded <= nallocated);
1367 v = PyString_FromStringAndSize(stackbuf, nneeded);
1368 }
1369 else {
1370 /* Cut back to size actually needed. */
1371 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1372 assert(nneeded <= nallocated);
1373 _PyString_Resize(&v, nneeded);
1374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001376
Tim Peters602f7402002-04-27 18:03:26 +00001377#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378}
1379
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1381{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 if (!PyUnicode_Check(unicode)) {
1383 PyErr_BadArgument();
1384 return NULL;
1385 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001386 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1387 PyUnicode_GET_SIZE(unicode),
1388 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389}
1390
1391/* --- UTF-16 Codec ------------------------------------------------------- */
1392
Tim Peters772747b2001-08-09 22:21:55 +00001393PyObject *
1394PyUnicode_DecodeUTF16(const char *s,
1395 int size,
1396 const char *errors,
1397 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001399 const char *starts = s;
1400 int startinpos;
1401 int endinpos;
1402 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403 PyUnicodeObject *unicode;
1404 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001405 const unsigned char *q, *e;
1406 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001407 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001408 /* Offsets from q for retrieving byte pairs in the right order. */
1409#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1410 int ihi = 1, ilo = 0;
1411#else
1412 int ihi = 0, ilo = 1;
1413#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 PyObject *errorHandler = NULL;
1415 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416
1417 /* Note: size will always be longer than the resulting Unicode
1418 character count */
1419 unicode = _PyUnicode_New(size);
1420 if (!unicode)
1421 return NULL;
1422 if (size == 0)
1423 return (PyObject *)unicode;
1424
1425 /* Unpack UTF-16 encoded data */
1426 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001427 q = (unsigned char *)s;
1428 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429
1430 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001431 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001433 /* Check for BOM marks (U+FEFF) in the input and adjust current
1434 byte order setting accordingly. In native mode, the leading BOM
1435 mark is skipped, in all other modes, it is copied to the output
1436 stream as-is (giving a ZWNBSP character). */
1437 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001438 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001439#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001440 if (bom == 0xFEFF) {
1441 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001442 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001443 }
1444 else if (bom == 0xFFFE) {
1445 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001446 bo = 1;
1447 }
Tim Petersced69f82003-09-16 20:30:58 +00001448#else
Tim Peters772747b2001-08-09 22:21:55 +00001449 if (bom == 0xFEFF) {
1450 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001451 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001452 }
1453 else if (bom == 0xFFFE) {
1454 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001455 bo = -1;
1456 }
1457#endif
1458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459
Tim Peters772747b2001-08-09 22:21:55 +00001460 if (bo == -1) {
1461 /* force LE */
1462 ihi = 1;
1463 ilo = 0;
1464 }
1465 else if (bo == 1) {
1466 /* force BE */
1467 ihi = 0;
1468 ilo = 1;
1469 }
1470
1471 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 Py_UNICODE ch;
1473 /* remaing bytes at the end? (size should be even) */
1474 if (e-q<2) {
1475 errmsg = "truncated data";
1476 startinpos = ((const char *)q)-starts;
1477 endinpos = ((const char *)e)-starts;
1478 goto utf16Error;
1479 /* The remaining input chars are ignored if the callback
1480 chooses to skip the input */
1481 }
1482 ch = (q[ihi] << 8) | q[ilo];
1483
Tim Peters772747b2001-08-09 22:21:55 +00001484 q += 2;
1485
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 if (ch < 0xD800 || ch > 0xDFFF) {
1487 *p++ = ch;
1488 continue;
1489 }
1490
1491 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001492 if (q >= e) {
1493 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 startinpos = (((const char *)q)-2)-starts;
1495 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001496 goto utf16Error;
1497 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001498 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001499 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1500 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001501 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001502#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001503 *p++ = ch;
1504 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001505#else
1506 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001507#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001508 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001509 }
1510 else {
1511 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001512 startinpos = (((const char *)q)-4)-starts;
1513 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001514 goto utf16Error;
1515 }
1516
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001518 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001519 startinpos = (((const char *)q)-2)-starts;
1520 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001521 /* Fall through to report the error */
1522
1523 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 outpos = p-PyUnicode_AS_UNICODE(unicode);
1525 if (unicode_decode_call_errorhandler(
1526 errors, &errorHandler,
1527 "utf16", errmsg,
1528 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1529 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001530 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001531 }
1532
1533 if (byteorder)
1534 *byteorder = bo;
1535
1536 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001537 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001538 goto onError;
1539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 Py_XDECREF(errorHandler);
1541 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 return (PyObject *)unicode;
1543
1544onError:
1545 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001546 Py_XDECREF(errorHandler);
1547 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548 return NULL;
1549}
1550
Tim Peters772747b2001-08-09 22:21:55 +00001551PyObject *
1552PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1553 int size,
1554 const char *errors,
1555 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556{
1557 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001558 unsigned char *p;
1559 int i, pairs;
1560 /* Offsets from p for storing byte pairs in the right order. */
1561#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1562 int ihi = 1, ilo = 0;
1563#else
1564 int ihi = 0, ilo = 1;
1565#endif
1566
1567#define STORECHAR(CH) \
1568 do { \
1569 p[ihi] = ((CH) >> 8) & 0xff; \
1570 p[ilo] = (CH) & 0xff; \
1571 p += 2; \
1572 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001574 for (i = pairs = 0; i < size; i++)
1575 if (s[i] >= 0x10000)
1576 pairs++;
Tim Petersced69f82003-09-16 20:30:58 +00001577 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001578 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579 if (v == NULL)
1580 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581
Tim Peters772747b2001-08-09 22:21:55 +00001582 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001584 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001585 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001586 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001587
1588 if (byteorder == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (byteorder == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001599 while (size-- > 0) {
1600 Py_UNICODE ch = *s++;
1601 Py_UNICODE ch2 = 0;
1602 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001603 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1604 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605 }
Tim Peters772747b2001-08-09 22:21:55 +00001606 STORECHAR(ch);
1607 if (ch2)
1608 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001611#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612}
1613
1614PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1615{
1616 if (!PyUnicode_Check(unicode)) {
1617 PyErr_BadArgument();
1618 return NULL;
1619 }
1620 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1621 PyUnicode_GET_SIZE(unicode),
1622 NULL,
1623 0);
1624}
1625
1626/* --- Unicode Escape Codec ----------------------------------------------- */
1627
Fredrik Lundh06d12682001-01-24 07:59:11 +00001628static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001629
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1631 int size,
1632 const char *errors)
1633{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001634 const char *starts = s;
1635 int startinpos;
1636 int endinpos;
1637 int outpos;
1638 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001639 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001640 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001642 char* message;
1643 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001644 PyObject *errorHandler = NULL;
1645 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 /* Escaped strings will always be longer than the resulting
1648 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 length after conversion to the true value.
1650 (but if the error callback returns a long replacement string
1651 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 v = _PyUnicode_New(size);
1653 if (v == NULL)
1654 goto onError;
1655 if (size == 0)
1656 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001658 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001660
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 while (s < end) {
1662 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001663 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665
1666 /* Non-escape characters are interpreted as Unicode ordinals */
1667 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001668 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 continue;
1670 }
1671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001672 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 /* \ - Escapes */
1674 s++;
1675 switch (*s++) {
1676
1677 /* \x escapes */
1678 case '\n': break;
1679 case '\\': *p++ = '\\'; break;
1680 case '\'': *p++ = '\''; break;
1681 case '\"': *p++ = '\"'; break;
1682 case 'b': *p++ = '\b'; break;
1683 case 'f': *p++ = '\014'; break; /* FF */
1684 case 't': *p++ = '\t'; break;
1685 case 'n': *p++ = '\n'; break;
1686 case 'r': *p++ = '\r'; break;
1687 case 'v': *p++ = '\013'; break; /* VT */
1688 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1689
1690 /* \OOO (octal) escapes */
1691 case '0': case '1': case '2': case '3':
1692 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001693 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001695 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001697 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001699 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 break;
1701
Fredrik Lundhccc74732001-02-18 22:13:49 +00001702 /* hex escapes */
1703 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001705 digits = 2;
1706 message = "truncated \\xXX escape";
1707 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708
Fredrik Lundhccc74732001-02-18 22:13:49 +00001709 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001711 digits = 4;
1712 message = "truncated \\uXXXX escape";
1713 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Fredrik Lundhccc74732001-02-18 22:13:49 +00001715 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001716 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001717 digits = 8;
1718 message = "truncated \\UXXXXXXXX escape";
1719 hexescape:
1720 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001721 outpos = p-PyUnicode_AS_UNICODE(v);
1722 if (s+digits>end) {
1723 endinpos = size;
1724 if (unicode_decode_call_errorhandler(
1725 errors, &errorHandler,
1726 "unicodeescape", "end of string in escape sequence",
1727 starts, size, &startinpos, &endinpos, &exc, &s,
1728 (PyObject **)&v, &outpos, &p))
1729 goto onError;
1730 goto nextByte;
1731 }
1732 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001733 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001734 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001735 endinpos = (s+i+1)-starts;
1736 if (unicode_decode_call_errorhandler(
1737 errors, &errorHandler,
1738 "unicodeescape", message,
1739 starts, size, &startinpos, &endinpos, &exc, &s,
1740 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001741 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001743 }
1744 chr = (chr<<4) & ~0xF;
1745 if (c >= '0' && c <= '9')
1746 chr += c - '0';
1747 else if (c >= 'a' && c <= 'f')
1748 chr += 10 + c - 'a';
1749 else
1750 chr += 10 + c - 'A';
1751 }
1752 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001753 if (chr == 0xffffffff)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001754 /* _decoding_error will have already written into the
1755 target buffer. */
1756 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001757 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001758 /* when we get here, chr is a 32-bit unicode character */
1759 if (chr <= 0xffff)
1760 /* UCS-2 character */
1761 *p++ = (Py_UNICODE) chr;
1762 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001763 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001764 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001765#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001766 *p++ = chr;
1767#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001768 chr -= 0x10000L;
1769 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001770 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001771#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001772 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773 endinpos = s-starts;
1774 outpos = p-PyUnicode_AS_UNICODE(v);
1775 if (unicode_decode_call_errorhandler(
1776 errors, &errorHandler,
1777 "unicodeescape", "illegal Unicode character",
1778 starts, size, &startinpos, &endinpos, &exc, &s,
1779 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001780 goto onError;
1781 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001782 break;
1783
1784 /* \N{name} */
1785 case 'N':
1786 message = "malformed \\N character escape";
1787 if (ucnhash_CAPI == NULL) {
1788 /* load the unicode data module */
1789 PyObject *m, *v;
1790 m = PyImport_ImportModule("unicodedata");
1791 if (m == NULL)
1792 goto ucnhashError;
1793 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1794 Py_DECREF(m);
1795 if (v == NULL)
1796 goto ucnhashError;
1797 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1798 Py_DECREF(v);
1799 if (ucnhash_CAPI == NULL)
1800 goto ucnhashError;
1801 }
1802 if (*s == '{') {
1803 const char *start = s+1;
1804 /* look for the closing brace */
1805 while (*s != '}' && s < end)
1806 s++;
1807 if (s > start && s < end && *s == '}') {
1808 /* found a name. look it up in the unicode database */
1809 message = "unknown Unicode character name";
1810 s++;
1811 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1812 goto store;
1813 }
1814 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 endinpos = s-starts;
1816 outpos = p-PyUnicode_AS_UNICODE(v);
1817 if (unicode_decode_call_errorhandler(
1818 errors, &errorHandler,
1819 "unicodeescape", message,
1820 starts, size, &startinpos, &endinpos, &exc, &s,
1821 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001823 break;
1824
1825 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001826 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 message = "\\ at end of string";
1828 s--;
1829 endinpos = s-starts;
1830 outpos = p-PyUnicode_AS_UNICODE(v);
1831 if (unicode_decode_call_errorhandler(
1832 errors, &errorHandler,
1833 "unicodeescape", message,
1834 starts, size, &startinpos, &endinpos, &exc, &s,
1835 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001836 goto onError;
1837 }
1838 else {
1839 *p++ = '\\';
1840 *p++ = (unsigned char)s[-1];
1841 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001842 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001843 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 nextByte:
1845 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001847 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001848 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001849 Py_XDECREF(errorHandler);
1850 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001852
Fredrik Lundhccc74732001-02-18 22:13:49 +00001853ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001854 PyErr_SetString(
1855 PyExc_UnicodeError,
1856 "\\N escapes not supported (can't load unicodedata module)"
1857 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 Py_XDECREF(errorHandler);
1859 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001860 return NULL;
1861
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001864 Py_XDECREF(errorHandler);
1865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 return NULL;
1867}
1868
1869/* Return a Unicode-Escape string version of the Unicode object.
1870
1871 If quotes is true, the string is enclosed in u"" or u'' quotes as
1872 appropriate.
1873
1874*/
1875
Barry Warsaw51ac5802000-03-20 16:36:48 +00001876static const Py_UNICODE *findchar(const Py_UNICODE *s,
1877 int size,
1878 Py_UNICODE ch);
1879
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880static
1881PyObject *unicodeescape_string(const Py_UNICODE *s,
1882 int size,
1883 int quotes)
1884{
1885 PyObject *repr;
1886 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001888 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889
1890 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1891 if (repr == NULL)
1892 return NULL;
1893
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001894 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895
1896 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001898 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 !findchar(s, size, '"')) ? '"' : '\'';
1900 }
1901 while (size-- > 0) {
1902 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001903
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001905 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001906 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907 *p++ = '\\';
1908 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001909 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001910 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001911
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001912#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001913 /* Map 21-bit characters to '\U00xxxxxx' */
1914 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001915 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001916
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001917 /* Resize the string if necessary */
1918 if (offset + 12 > PyString_GET_SIZE(repr)) {
1919 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001920 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001921 p = PyString_AS_STRING(repr) + offset;
1922 }
1923
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001924 *p++ = '\\';
1925 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001926 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1927 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1928 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1929 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1930 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1931 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1932 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001933 *p++ = hexdigit[ch & 0x0000000F];
1934 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001935 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001936#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001937 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1938 else if (ch >= 0xD800 && ch < 0xDC00) {
1939 Py_UNICODE ch2;
1940 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00001941
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001942 ch2 = *s++;
1943 size--;
1944 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1945 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1946 *p++ = '\\';
1947 *p++ = 'U';
1948 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1949 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1950 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1951 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1952 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1953 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1954 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1955 *p++ = hexdigit[ucs & 0x0000000F];
1956 continue;
1957 }
1958 /* Fall through: isolated surrogates are copied as-is */
1959 s--;
1960 size++;
1961 }
1962
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001964 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 *p++ = '\\';
1966 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001967 *p++ = hexdigit[(ch >> 12) & 0x000F];
1968 *p++ = hexdigit[(ch >> 8) & 0x000F];
1969 *p++ = hexdigit[(ch >> 4) & 0x000F];
1970 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001972
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001973 /* Map special whitespace to '\t', \n', '\r' */
1974 else if (ch == '\t') {
1975 *p++ = '\\';
1976 *p++ = 't';
1977 }
1978 else if (ch == '\n') {
1979 *p++ = '\\';
1980 *p++ = 'n';
1981 }
1982 else if (ch == '\r') {
1983 *p++ = '\\';
1984 *p++ = 'r';
1985 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001986
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001987 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001988 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001990 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001991 *p++ = hexdigit[(ch >> 4) & 0x000F];
1992 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00001993 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001994
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995 /* Copy everything else as-is */
1996 else
1997 *p++ = (char) ch;
1998 }
1999 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002000 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
2002 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002003 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002004 return repr;
2005}
2006
2007PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2008 int size)
2009{
2010 return unicodeescape_string(s, size, 0);
2011}
2012
2013PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2014{
2015 if (!PyUnicode_Check(unicode)) {
2016 PyErr_BadArgument();
2017 return NULL;
2018 }
2019 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2020 PyUnicode_GET_SIZE(unicode));
2021}
2022
2023/* --- Raw Unicode Escape Codec ------------------------------------------- */
2024
2025PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2026 int size,
2027 const char *errors)
2028{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002029 const char *starts = s;
2030 int startinpos;
2031 int endinpos;
2032 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002034 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 const char *end;
2036 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002037 PyObject *errorHandler = NULL;
2038 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002039
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 /* Escaped strings will always be longer than the resulting
2041 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002042 length after conversion to the true value. (But decoding error
2043 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 v = _PyUnicode_New(size);
2045 if (v == NULL)
2046 goto onError;
2047 if (size == 0)
2048 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 end = s + size;
2051 while (s < end) {
2052 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002053 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002055 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056
2057 /* Non-escape characters are interpreted as Unicode ordinals */
2058 if (*s != '\\') {
2059 *p++ = (unsigned char)*s++;
2060 continue;
2061 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063
2064 /* \u-escapes are only interpreted iff the number of leading
2065 backslashes if odd */
2066 bs = s;
2067 for (;s < end;) {
2068 if (*s != '\\')
2069 break;
2070 *p++ = (unsigned char)*s++;
2071 }
2072 if (((s - bs) & 1) == 0 ||
2073 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002074 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 continue;
2076 }
2077 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002078 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 s++;
2080
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002081 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002083 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002084 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002086 endinpos = s-starts;
2087 if (unicode_decode_call_errorhandler(
2088 errors, &errorHandler,
2089 "rawunicodeescape", "truncated \\uXXXX",
2090 starts, size, &startinpos, &endinpos, &exc, &s,
2091 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002093 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 }
2095 x = (x<<4) & ~0xF;
2096 if (c >= '0' && c <= '9')
2097 x += c - '0';
2098 else if (c >= 'a' && c <= 'f')
2099 x += 10 + c - 'a';
2100 else
2101 x += 10 + c - 'A';
2102 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002103#ifndef Py_UNICODE_WIDE
2104 if (x > 0x10000) {
2105 if (unicode_decode_call_errorhandler(
2106 errors, &errorHandler,
2107 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2108 starts, size, &startinpos, &endinpos, &exc, &s,
2109 (PyObject **)&v, &outpos, &p))
2110 goto onError;
2111 }
2112#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113 *p++ = x;
2114 nextByte:
2115 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002117 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002118 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 Py_XDECREF(errorHandler);
2120 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002122
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 onError:
2124 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 Py_XDECREF(errorHandler);
2126 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 return NULL;
2128}
2129
2130PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2131 int size)
2132{
2133 PyObject *repr;
2134 char *p;
2135 char *q;
2136
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002137 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002139#ifdef Py_UNICODE_WIDE
2140 repr = PyString_FromStringAndSize(NULL, 10 * size);
2141#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 if (repr == NULL)
2145 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002146 if (size == 0)
2147 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148
2149 p = q = PyString_AS_STRING(repr);
2150 while (size-- > 0) {
2151 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002152#ifdef Py_UNICODE_WIDE
2153 /* Map 32-bit characters to '\Uxxxxxxxx' */
2154 if (ch >= 0x10000) {
2155 *p++ = '\\';
2156 *p++ = 'U';
2157 *p++ = hexdigit[(ch >> 28) & 0xf];
2158 *p++ = hexdigit[(ch >> 24) & 0xf];
2159 *p++ = hexdigit[(ch >> 20) & 0xf];
2160 *p++ = hexdigit[(ch >> 16) & 0xf];
2161 *p++ = hexdigit[(ch >> 12) & 0xf];
2162 *p++ = hexdigit[(ch >> 8) & 0xf];
2163 *p++ = hexdigit[(ch >> 4) & 0xf];
2164 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002165 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002166 else
2167#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 /* Map 16-bit characters to '\uxxxx' */
2169 if (ch >= 256) {
2170 *p++ = '\\';
2171 *p++ = 'u';
2172 *p++ = hexdigit[(ch >> 12) & 0xf];
2173 *p++ = hexdigit[(ch >> 8) & 0xf];
2174 *p++ = hexdigit[(ch >> 4) & 0xf];
2175 *p++ = hexdigit[ch & 15];
2176 }
2177 /* Copy everything else as-is */
2178 else
2179 *p++ = (char) ch;
2180 }
2181 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002182 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 return repr;
2184}
2185
2186PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2187{
2188 if (!PyUnicode_Check(unicode)) {
2189 PyErr_BadArgument();
2190 return NULL;
2191 }
2192 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2193 PyUnicode_GET_SIZE(unicode));
2194}
2195
2196/* --- Latin-1 Codec ------------------------------------------------------ */
2197
2198PyObject *PyUnicode_DecodeLatin1(const char *s,
2199 int size,
2200 const char *errors)
2201{
2202 PyUnicodeObject *v;
2203 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002204
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002206 if (size == 1 && *(unsigned char*)s < 256) {
2207 Py_UNICODE r = *(unsigned char*)s;
2208 return PyUnicode_FromUnicode(&r, 1);
2209 }
2210
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 v = _PyUnicode_New(size);
2212 if (v == NULL)
2213 goto onError;
2214 if (size == 0)
2215 return (PyObject *)v;
2216 p = PyUnicode_AS_UNICODE(v);
2217 while (size-- > 0)
2218 *p++ = (unsigned char)*s++;
2219 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002220
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 onError:
2222 Py_XDECREF(v);
2223 return NULL;
2224}
2225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002226/* create or adjust a UnicodeEncodeError */
2227static void make_encode_exception(PyObject **exceptionObject,
2228 const char *encoding,
2229 const Py_UNICODE *unicode, int size,
2230 int startpos, int endpos,
2231 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002233 if (*exceptionObject == NULL) {
2234 *exceptionObject = PyUnicodeEncodeError_Create(
2235 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 }
2237 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002238 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2239 goto onError;
2240 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2241 goto onError;
2242 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2243 goto onError;
2244 return;
2245 onError:
2246 Py_DECREF(*exceptionObject);
2247 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 }
2249}
2250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002251/* raises a UnicodeEncodeError */
2252static void raise_encode_exception(PyObject **exceptionObject,
2253 const char *encoding,
2254 const Py_UNICODE *unicode, int size,
2255 int startpos, int endpos,
2256 const char *reason)
2257{
2258 make_encode_exception(exceptionObject,
2259 encoding, unicode, size, startpos, endpos, reason);
2260 if (*exceptionObject != NULL)
2261 PyCodec_StrictErrors(*exceptionObject);
2262}
2263
2264/* error handling callback helper:
2265 build arguments, call the callback and check the arguments,
2266 put the result into newpos and return the replacement string, which
2267 has to be freed by the caller */
2268static PyObject *unicode_encode_call_errorhandler(const char *errors,
2269 PyObject **errorHandler,
2270 const char *encoding, const char *reason,
2271 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2272 int startpos, int endpos,
2273 int *newpos)
2274{
2275 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2276
2277 PyObject *restuple;
2278 PyObject *resunicode;
2279
2280 if (*errorHandler == NULL) {
2281 *errorHandler = PyCodec_LookupError(errors);
2282 if (*errorHandler == NULL)
2283 return NULL;
2284 }
2285
2286 make_encode_exception(exceptionObject,
2287 encoding, unicode, size, startpos, endpos, reason);
2288 if (*exceptionObject == NULL)
2289 return NULL;
2290
2291 restuple = PyObject_CallFunctionObjArgs(
2292 *errorHandler, *exceptionObject, NULL);
2293 if (restuple == NULL)
2294 return NULL;
2295 if (!PyTuple_Check(restuple)) {
2296 PyErr_Format(PyExc_TypeError, &argparse[4]);
2297 Py_DECREF(restuple);
2298 return NULL;
2299 }
2300 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2301 &resunicode, newpos)) {
2302 Py_DECREF(restuple);
2303 return NULL;
2304 }
2305 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002306 *newpos = size+*newpos;
2307 if (*newpos<0 || *newpos>size) {
2308 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2309 Py_DECREF(restuple);
2310 return NULL;
2311 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312 Py_INCREF(resunicode);
2313 Py_DECREF(restuple);
2314 return resunicode;
2315}
2316
2317static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2318 int size,
2319 const char *errors,
2320 int limit)
2321{
2322 /* output object */
2323 PyObject *res;
2324 /* pointers to the beginning and end+1 of input */
2325 const Py_UNICODE *startp = p;
2326 const Py_UNICODE *endp = p + size;
2327 /* pointer to the beginning of the unencodable characters */
2328 /* const Py_UNICODE *badp = NULL; */
2329 /* pointer into the output */
2330 char *str;
2331 /* current output position */
2332 int respos = 0;
2333 int ressize;
2334 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2335 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2336 PyObject *errorHandler = NULL;
2337 PyObject *exc = NULL;
2338 /* the following variable is used for caching string comparisons
2339 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2340 int known_errorHandler = -1;
2341
2342 /* allocate enough for a simple encoding without
2343 replacements, if we need more, we'll resize */
2344 res = PyString_FromStringAndSize(NULL, size);
2345 if (res == NULL)
2346 goto onError;
2347 if (size == 0)
2348 return res;
2349 str = PyString_AS_STRING(res);
2350 ressize = size;
2351
2352 while (p<endp) {
2353 Py_UNICODE c = *p;
2354
2355 /* can we encode this? */
2356 if (c<limit) {
2357 /* no overflow check, because we know that the space is enough */
2358 *str++ = (char)c;
2359 ++p;
2360 }
2361 else {
2362 int unicodepos = p-startp;
2363 int requiredsize;
2364 PyObject *repunicode;
2365 int repsize;
2366 int newpos;
2367 int respos;
2368 Py_UNICODE *uni2;
2369 /* startpos for collecting unencodable chars */
2370 const Py_UNICODE *collstart = p;
2371 const Py_UNICODE *collend = p;
2372 /* find all unecodable characters */
2373 while ((collend < endp) && ((*collend)>=limit))
2374 ++collend;
2375 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2376 if (known_errorHandler==-1) {
2377 if ((errors==NULL) || (!strcmp(errors, "strict")))
2378 known_errorHandler = 1;
2379 else if (!strcmp(errors, "replace"))
2380 known_errorHandler = 2;
2381 else if (!strcmp(errors, "ignore"))
2382 known_errorHandler = 3;
2383 else if (!strcmp(errors, "xmlcharrefreplace"))
2384 known_errorHandler = 4;
2385 else
2386 known_errorHandler = 0;
2387 }
2388 switch (known_errorHandler) {
2389 case 1: /* strict */
2390 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2391 goto onError;
2392 case 2: /* replace */
2393 while (collstart++<collend)
2394 *str++ = '?'; /* fall through */
2395 case 3: /* ignore */
2396 p = collend;
2397 break;
2398 case 4: /* xmlcharrefreplace */
2399 respos = str-PyString_AS_STRING(res);
2400 /* determine replacement size (temporarily (mis)uses p) */
2401 for (p = collstart, repsize = 0; p < collend; ++p) {
2402 if (*p<10)
2403 repsize += 2+1+1;
2404 else if (*p<100)
2405 repsize += 2+2+1;
2406 else if (*p<1000)
2407 repsize += 2+3+1;
2408 else if (*p<10000)
2409 repsize += 2+4+1;
2410 else if (*p<100000)
2411 repsize += 2+5+1;
2412 else if (*p<1000000)
2413 repsize += 2+6+1;
2414 else
2415 repsize += 2+7+1;
2416 }
2417 requiredsize = respos+repsize+(endp-collend);
2418 if (requiredsize > ressize) {
2419 if (requiredsize<2*ressize)
2420 requiredsize = 2*ressize;
2421 if (_PyString_Resize(&res, requiredsize))
2422 goto onError;
2423 str = PyString_AS_STRING(res) + respos;
2424 ressize = requiredsize;
2425 }
2426 /* generate replacement (temporarily (mis)uses p) */
2427 for (p = collstart; p < collend; ++p) {
2428 str += sprintf(str, "&#%d;", (int)*p);
2429 }
2430 p = collend;
2431 break;
2432 default:
2433 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2434 encoding, reason, startp, size, &exc,
2435 collstart-startp, collend-startp, &newpos);
2436 if (repunicode == NULL)
2437 goto onError;
2438 /* need more space? (at least enough for what we
2439 have+the replacement+the rest of the string, so
2440 we won't have to check space for encodable characters) */
2441 respos = str-PyString_AS_STRING(res);
2442 repsize = PyUnicode_GET_SIZE(repunicode);
2443 requiredsize = respos+repsize+(endp-collend);
2444 if (requiredsize > ressize) {
2445 if (requiredsize<2*ressize)
2446 requiredsize = 2*ressize;
2447 if (_PyString_Resize(&res, requiredsize)) {
2448 Py_DECREF(repunicode);
2449 goto onError;
2450 }
2451 str = PyString_AS_STRING(res) + respos;
2452 ressize = requiredsize;
2453 }
2454 /* check if there is anything unencodable in the replacement
2455 and copy it to the output */
2456 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2457 c = *uni2;
2458 if (c >= limit) {
2459 raise_encode_exception(&exc, encoding, startp, size,
2460 unicodepos, unicodepos+1, reason);
2461 Py_DECREF(repunicode);
2462 goto onError;
2463 }
2464 *str = (char)c;
2465 }
2466 p = startp + newpos;
2467 Py_DECREF(repunicode);
2468 }
2469 }
2470 }
2471 /* Resize if we allocated to much */
2472 respos = str-PyString_AS_STRING(res);
2473 if (respos<ressize)
2474 /* If this falls res will be NULL */
2475 _PyString_Resize(&res, respos);
2476 Py_XDECREF(errorHandler);
2477 Py_XDECREF(exc);
2478 return res;
2479
2480 onError:
2481 Py_XDECREF(res);
2482 Py_XDECREF(errorHandler);
2483 Py_XDECREF(exc);
2484 return NULL;
2485}
2486
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2488 int size,
2489 const char *errors)
2490{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492}
2493
2494PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2495{
2496 if (!PyUnicode_Check(unicode)) {
2497 PyErr_BadArgument();
2498 return NULL;
2499 }
2500 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2501 PyUnicode_GET_SIZE(unicode),
2502 NULL);
2503}
2504
2505/* --- 7-bit ASCII Codec -------------------------------------------------- */
2506
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507PyObject *PyUnicode_DecodeASCII(const char *s,
2508 int size,
2509 const char *errors)
2510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 PyUnicodeObject *v;
2513 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002514 int startinpos;
2515 int endinpos;
2516 int outpos;
2517 const char *e;
2518 PyObject *errorHandler = NULL;
2519 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002520
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002522 if (size == 1 && *(unsigned char*)s < 128) {
2523 Py_UNICODE r = *(unsigned char*)s;
2524 return PyUnicode_FromUnicode(&r, 1);
2525 }
Tim Petersced69f82003-09-16 20:30:58 +00002526
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 v = _PyUnicode_New(size);
2528 if (v == NULL)
2529 goto onError;
2530 if (size == 0)
2531 return (PyObject *)v;
2532 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002533 e = s + size;
2534 while (s < e) {
2535 register unsigned char c = (unsigned char)*s;
2536 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 ++s;
2539 }
2540 else {
2541 startinpos = s-starts;
2542 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002543 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 if (unicode_decode_call_errorhandler(
2545 errors, &errorHandler,
2546 "ascii", "ordinal not in range(128)",
2547 starts, size, &startinpos, &endinpos, &exc, &s,
2548 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002552 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002553 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002554 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002555 Py_XDECREF(errorHandler);
2556 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002558
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 onError:
2560 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 Py_XDECREF(errorHandler);
2562 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 return NULL;
2564}
2565
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2567 int size,
2568 const char *errors)
2569{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002570 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571}
2572
2573PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2574{
2575 if (!PyUnicode_Check(unicode)) {
2576 PyErr_BadArgument();
2577 return NULL;
2578 }
2579 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2580 PyUnicode_GET_SIZE(unicode),
2581 NULL);
2582}
2583
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002584#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002585
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002586/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002587
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002588PyObject *PyUnicode_DecodeMBCS(const char *s,
2589 int size,
2590 const char *errors)
2591{
2592 PyUnicodeObject *v;
2593 Py_UNICODE *p;
2594
2595 /* First get the size of the result */
2596 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002597 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002598 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2599
2600 v = _PyUnicode_New(usize);
2601 if (v == NULL)
2602 return NULL;
2603 if (usize == 0)
2604 return (PyObject *)v;
2605 p = PyUnicode_AS_UNICODE(v);
2606 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2607 Py_DECREF(v);
2608 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2609 }
2610
2611 return (PyObject *)v;
2612}
2613
2614PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2615 int size,
2616 const char *errors)
2617{
2618 PyObject *repr;
2619 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002620 DWORD mbcssize;
2621
2622 /* If there are no characters, bail now! */
2623 if (size==0)
2624 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002625
2626 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002627 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002628 if (mbcssize==0)
2629 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2630
2631 repr = PyString_FromStringAndSize(NULL, mbcssize);
2632 if (repr == NULL)
2633 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002634 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002635 return repr;
2636
2637 /* Do the conversion */
2638 s = PyString_AS_STRING(repr);
2639 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2640 Py_DECREF(repr);
2641 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2642 }
2643 return repr;
2644}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002645
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002646PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2647{
2648 if (!PyUnicode_Check(unicode)) {
2649 PyErr_BadArgument();
2650 return NULL;
2651 }
2652 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2653 PyUnicode_GET_SIZE(unicode),
2654 NULL);
2655}
2656
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002657#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002658
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659/* --- Character Mapping Codec -------------------------------------------- */
2660
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661PyObject *PyUnicode_DecodeCharmap(const char *s,
2662 int size,
2663 PyObject *mapping,
2664 const char *errors)
2665{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002666 const char *starts = s;
2667 int startinpos;
2668 int endinpos;
2669 int outpos;
2670 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 PyUnicodeObject *v;
2672 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002673 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 PyObject *errorHandler = NULL;
2675 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002676
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 /* Default to Latin-1 */
2678 if (mapping == NULL)
2679 return PyUnicode_DecodeLatin1(s, size, errors);
2680
2681 v = _PyUnicode_New(size);
2682 if (v == NULL)
2683 goto onError;
2684 if (size == 0)
2685 return (PyObject *)v;
2686 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002687 e = s + size;
2688 while (s < e) {
2689 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 PyObject *w, *x;
2691
2692 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2693 w = PyInt_FromLong((long)ch);
2694 if (w == NULL)
2695 goto onError;
2696 x = PyObject_GetItem(mapping, w);
2697 Py_DECREF(w);
2698 if (x == NULL) {
2699 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002700 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002702 x = Py_None;
2703 Py_INCREF(x);
2704 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002705 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 }
2707
2708 /* Apply mapping */
2709 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002710 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711 if (value < 0 || value > 65535) {
2712 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002713 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 Py_DECREF(x);
2715 goto onError;
2716 }
2717 *p++ = (Py_UNICODE)value;
2718 }
2719 else if (x == Py_None) {
2720 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 outpos = p-PyUnicode_AS_UNICODE(v);
2722 startinpos = s-starts;
2723 endinpos = startinpos+1;
2724 if (unicode_decode_call_errorhandler(
2725 errors, &errorHandler,
2726 "charmap", "character maps to <undefined>",
2727 starts, size, &startinpos, &endinpos, &exc, &s,
2728 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 Py_DECREF(x);
2730 goto onError;
2731 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 }
2734 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002735 int targetsize = PyUnicode_GET_SIZE(x);
2736
2737 if (targetsize == 1)
2738 /* 1-1 mapping */
2739 *p++ = *PyUnicode_AS_UNICODE(x);
2740
2741 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002743 if (targetsize > extrachars) {
2744 /* resize first */
2745 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2746 int needed = (targetsize - extrachars) + \
2747 (targetsize << 2);
2748 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002749 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002750 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002751 Py_DECREF(x);
2752 goto onError;
2753 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002754 p = PyUnicode_AS_UNICODE(v) + oldpos;
2755 }
2756 Py_UNICODE_COPY(p,
2757 PyUnicode_AS_UNICODE(x),
2758 targetsize);
2759 p += targetsize;
2760 extrachars -= targetsize;
2761 }
2762 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 }
2764 else {
2765 /* wrong return value */
2766 PyErr_SetString(PyExc_TypeError,
2767 "character mapping must return integer, None or unicode");
2768 Py_DECREF(x);
2769 goto onError;
2770 }
2771 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 }
2774 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002775 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002777 Py_XDECREF(errorHandler);
2778 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002780
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 Py_XDECREF(errorHandler);
2783 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 Py_XDECREF(v);
2785 return NULL;
2786}
2787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788/* Lookup the character ch in the mapping. If the character
2789 can't be found, Py_None is returned (or NULL, if another
2790 error occured). */
2791static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 PyObject *w = PyInt_FromLong((long)c);
2794 PyObject *x;
2795
2796 if (w == NULL)
2797 return NULL;
2798 x = PyObject_GetItem(mapping, w);
2799 Py_DECREF(w);
2800 if (x == NULL) {
2801 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2802 /* No mapping found means: mapping is undefined. */
2803 PyErr_Clear();
2804 x = Py_None;
2805 Py_INCREF(x);
2806 return x;
2807 } else
2808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002810 else if (x == Py_None)
2811 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002812 else if (PyInt_Check(x)) {
2813 long value = PyInt_AS_LONG(x);
2814 if (value < 0 || value > 255) {
2815 PyErr_SetString(PyExc_TypeError,
2816 "character mapping must be in range(256)");
2817 Py_DECREF(x);
2818 return NULL;
2819 }
2820 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 else if (PyString_Check(x))
2823 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002825 /* wrong return value */
2826 PyErr_SetString(PyExc_TypeError,
2827 "character mapping must return integer, None or str");
2828 Py_DECREF(x);
2829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 }
2831}
2832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833/* lookup the character, put the result in the output string and adjust
2834 various state variables. Reallocate the output string if not enough
2835 space is available. Return a new reference to the object that
2836 was put in the output buffer, or Py_None, if the mapping was undefined
2837 (in which case no character was written) or NULL, if a
2838 reallocation error ocurred. The called must decref the result */
2839static
2840PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2841 PyObject **outobj, int *outpos)
2842{
2843 PyObject *rep = charmapencode_lookup(c, mapping);
2844
2845 if (rep==NULL)
2846 return NULL;
2847 else if (rep==Py_None)
2848 return rep;
2849 else {
2850 char *outstart = PyString_AS_STRING(*outobj);
2851 int outsize = PyString_GET_SIZE(*outobj);
2852 if (PyInt_Check(rep)) {
2853 int requiredsize = *outpos+1;
2854 if (outsize<requiredsize) {
2855 /* exponentially overallocate to minimize reallocations */
2856 if (requiredsize < 2*outsize)
2857 requiredsize = 2*outsize;
2858 if (_PyString_Resize(outobj, requiredsize)) {
2859 Py_DECREF(rep);
2860 return NULL;
2861 }
2862 outstart = PyString_AS_STRING(*outobj);
2863 }
2864 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2865 }
2866 else {
2867 const char *repchars = PyString_AS_STRING(rep);
2868 int repsize = PyString_GET_SIZE(rep);
2869 int requiredsize = *outpos+repsize;
2870 if (outsize<requiredsize) {
2871 /* exponentially overallocate to minimize reallocations */
2872 if (requiredsize < 2*outsize)
2873 requiredsize = 2*outsize;
2874 if (_PyString_Resize(outobj, requiredsize)) {
2875 Py_DECREF(rep);
2876 return NULL;
2877 }
2878 outstart = PyString_AS_STRING(*outobj);
2879 }
2880 memcpy(outstart + *outpos, repchars, repsize);
2881 *outpos += repsize;
2882 }
2883 }
2884 return rep;
2885}
2886
2887/* handle an error in PyUnicode_EncodeCharmap
2888 Return 0 on success, -1 on error */
2889static
2890int charmap_encoding_error(
2891 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2892 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002893 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002894 PyObject **res, int *respos)
2895{
2896 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2897 int repsize;
2898 int newpos;
2899 Py_UNICODE *uni2;
2900 /* startpos for collecting unencodable chars */
2901 int collstartpos = *inpos;
2902 int collendpos = *inpos+1;
2903 int collpos;
2904 char *encoding = "charmap";
2905 char *reason = "character maps to <undefined>";
2906
2907 PyObject *x;
2908 /* find all unencodable characters */
2909 while (collendpos < size) {
2910 x = charmapencode_lookup(p[collendpos], mapping);
2911 if (x==NULL)
2912 return -1;
2913 else if (x!=Py_None) {
2914 Py_DECREF(x);
2915 break;
2916 }
2917 Py_DECREF(x);
2918 ++collendpos;
2919 }
2920 /* cache callback name lookup
2921 * (if not done yet, i.e. it's the first error) */
2922 if (*known_errorHandler==-1) {
2923 if ((errors==NULL) || (!strcmp(errors, "strict")))
2924 *known_errorHandler = 1;
2925 else if (!strcmp(errors, "replace"))
2926 *known_errorHandler = 2;
2927 else if (!strcmp(errors, "ignore"))
2928 *known_errorHandler = 3;
2929 else if (!strcmp(errors, "xmlcharrefreplace"))
2930 *known_errorHandler = 4;
2931 else
2932 *known_errorHandler = 0;
2933 }
2934 switch (*known_errorHandler) {
2935 case 1: /* strict */
2936 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2937 return -1;
2938 case 2: /* replace */
2939 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2940 x = charmapencode_output('?', mapping, res, respos);
2941 if (x==NULL) {
2942 return -1;
2943 }
2944 else if (x==Py_None) {
2945 Py_DECREF(x);
2946 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2947 return -1;
2948 }
2949 Py_DECREF(x);
2950 }
2951 /* fall through */
2952 case 3: /* ignore */
2953 *inpos = collendpos;
2954 break;
2955 case 4: /* xmlcharrefreplace */
2956 /* generate replacement (temporarily (mis)uses p) */
2957 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2958 char buffer[2+29+1+1];
2959 char *cp;
2960 sprintf(buffer, "&#%d;", (int)p[collpos]);
2961 for (cp = buffer; *cp; ++cp) {
2962 x = charmapencode_output(*cp, mapping, res, respos);
2963 if (x==NULL)
2964 return -1;
2965 else if (x==Py_None) {
2966 Py_DECREF(x);
2967 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2968 return -1;
2969 }
2970 Py_DECREF(x);
2971 }
2972 }
2973 *inpos = collendpos;
2974 break;
2975 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002976 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002977 encoding, reason, p, size, exceptionObject,
2978 collstartpos, collendpos, &newpos);
2979 if (repunicode == NULL)
2980 return -1;
2981 /* generate replacement */
2982 repsize = PyUnicode_GET_SIZE(repunicode);
2983 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2984 x = charmapencode_output(*uni2, mapping, res, respos);
2985 if (x==NULL) {
2986 Py_DECREF(repunicode);
2987 return -1;
2988 }
2989 else if (x==Py_None) {
2990 Py_DECREF(repunicode);
2991 Py_DECREF(x);
2992 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2993 return -1;
2994 }
2995 Py_DECREF(x);
2996 }
2997 *inpos = newpos;
2998 Py_DECREF(repunicode);
2999 }
3000 return 0;
3001}
3002
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3004 int size,
3005 PyObject *mapping,
3006 const char *errors)
3007{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003008 /* output object */
3009 PyObject *res = NULL;
3010 /* current input position */
3011 int inpos = 0;
3012 /* current output position */
3013 int respos = 0;
3014 PyObject *errorHandler = NULL;
3015 PyObject *exc = NULL;
3016 /* the following variable is used for caching string comparisons
3017 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3018 * 3=ignore, 4=xmlcharrefreplace */
3019 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020
3021 /* Default to Latin-1 */
3022 if (mapping == NULL)
3023 return PyUnicode_EncodeLatin1(p, size, errors);
3024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 /* allocate enough for a simple encoding without
3026 replacements, if we need more, we'll resize */
3027 res = PyString_FromStringAndSize(NULL, size);
3028 if (res == NULL)
3029 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003030 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 while (inpos<size) {
3034 /* try to encode it */
3035 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3036 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 if (x==Py_None) { /* unencodable character */
3039 if (charmap_encoding_error(p, size, &inpos, mapping,
3040 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003041 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003042 &res, &respos)) {
3043 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003044 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003047 else
3048 /* done with this character => adjust input position */
3049 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 Py_DECREF(x);
3051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 /* Resize if we allocated to much */
3054 if (respos<PyString_GET_SIZE(res)) {
3055 if (_PyString_Resize(&res, respos))
3056 goto onError;
3057 }
3058 Py_XDECREF(exc);
3059 Py_XDECREF(errorHandler);
3060 return res;
3061
3062 onError:
3063 Py_XDECREF(res);
3064 Py_XDECREF(exc);
3065 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066 return NULL;
3067}
3068
3069PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3070 PyObject *mapping)
3071{
3072 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3073 PyErr_BadArgument();
3074 return NULL;
3075 }
3076 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3077 PyUnicode_GET_SIZE(unicode),
3078 mapping,
3079 NULL);
3080}
3081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003082/* create or adjust a UnicodeTranslateError */
3083static void make_translate_exception(PyObject **exceptionObject,
3084 const Py_UNICODE *unicode, int size,
3085 int startpos, int endpos,
3086 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 if (*exceptionObject == NULL) {
3089 *exceptionObject = PyUnicodeTranslateError_Create(
3090 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
3092 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003093 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3094 goto onError;
3095 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3096 goto onError;
3097 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3098 goto onError;
3099 return;
3100 onError:
3101 Py_DECREF(*exceptionObject);
3102 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 }
3104}
3105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003106/* raises a UnicodeTranslateError */
3107static void raise_translate_exception(PyObject **exceptionObject,
3108 const Py_UNICODE *unicode, int size,
3109 int startpos, int endpos,
3110 const char *reason)
3111{
3112 make_translate_exception(exceptionObject,
3113 unicode, size, startpos, endpos, reason);
3114 if (*exceptionObject != NULL)
3115 PyCodec_StrictErrors(*exceptionObject);
3116}
3117
3118/* error handling callback helper:
3119 build arguments, call the callback and check the arguments,
3120 put the result into newpos and return the replacement string, which
3121 has to be freed by the caller */
3122static PyObject *unicode_translate_call_errorhandler(const char *errors,
3123 PyObject **errorHandler,
3124 const char *reason,
3125 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3126 int startpos, int endpos,
3127 int *newpos)
3128{
3129 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3130
3131 PyObject *restuple;
3132 PyObject *resunicode;
3133
3134 if (*errorHandler == NULL) {
3135 *errorHandler = PyCodec_LookupError(errors);
3136 if (*errorHandler == NULL)
3137 return NULL;
3138 }
3139
3140 make_translate_exception(exceptionObject,
3141 unicode, size, startpos, endpos, reason);
3142 if (*exceptionObject == NULL)
3143 return NULL;
3144
3145 restuple = PyObject_CallFunctionObjArgs(
3146 *errorHandler, *exceptionObject, NULL);
3147 if (restuple == NULL)
3148 return NULL;
3149 if (!PyTuple_Check(restuple)) {
3150 PyErr_Format(PyExc_TypeError, &argparse[4]);
3151 Py_DECREF(restuple);
3152 return NULL;
3153 }
3154 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3155 &resunicode, newpos)) {
3156 Py_DECREF(restuple);
3157 return NULL;
3158 }
3159 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003160 *newpos = size+*newpos;
3161 if (*newpos<0 || *newpos>size) {
3162 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3163 Py_DECREF(restuple);
3164 return NULL;
3165 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003166 Py_INCREF(resunicode);
3167 Py_DECREF(restuple);
3168 return resunicode;
3169}
3170
3171/* Lookup the character ch in the mapping and put the result in result,
3172 which must be decrefed by the caller.
3173 Return 0 on success, -1 on error */
3174static
3175int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3176{
3177 PyObject *w = PyInt_FromLong((long)c);
3178 PyObject *x;
3179
3180 if (w == NULL)
3181 return -1;
3182 x = PyObject_GetItem(mapping, w);
3183 Py_DECREF(w);
3184 if (x == NULL) {
3185 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3186 /* No mapping found means: use 1:1 mapping. */
3187 PyErr_Clear();
3188 *result = NULL;
3189 return 0;
3190 } else
3191 return -1;
3192 }
3193 else if (x == Py_None) {
3194 *result = x;
3195 return 0;
3196 }
3197 else if (PyInt_Check(x)) {
3198 long value = PyInt_AS_LONG(x);
3199 long max = PyUnicode_GetMax();
3200 if (value < 0 || value > max) {
3201 PyErr_Format(PyExc_TypeError,
3202 "character mapping must be in range(0x%lx)", max+1);
3203 Py_DECREF(x);
3204 return -1;
3205 }
3206 *result = x;
3207 return 0;
3208 }
3209 else if (PyUnicode_Check(x)) {
3210 *result = x;
3211 return 0;
3212 }
3213 else {
3214 /* wrong return value */
3215 PyErr_SetString(PyExc_TypeError,
3216 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003217 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 return -1;
3219 }
3220}
3221/* ensure that *outobj is at least requiredsize characters long,
3222if not reallocate and adjust various state variables.
3223Return 0 on success, -1 on error */
3224static
3225int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3226 int requiredsize)
3227{
3228 if (requiredsize > *outsize) {
3229 /* remember old output position */
3230 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3231 /* exponentially overallocate to minimize reallocations */
3232 if (requiredsize < 2 * *outsize)
3233 requiredsize = 2 * *outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003234 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003235 return -1;
3236 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3237 *outsize = requiredsize;
3238 }
3239 return 0;
3240}
3241/* lookup the character, put the result in the output string and adjust
3242 various state variables. Return a new reference to the object that
3243 was put in the output buffer in *result, or Py_None, if the mapping was
3244 undefined (in which case no character was written).
3245 The called must decref result.
3246 Return 0 on success, -1 on error. */
3247static
3248int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3249 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3250{
3251 if (charmaptranslate_lookup(c, mapping, res))
3252 return -1;
3253 if (*res==NULL) {
3254 /* not found => default to 1:1 mapping */
3255 *(*outp)++ = (Py_UNICODE)c;
3256 }
3257 else if (*res==Py_None)
3258 ;
3259 else if (PyInt_Check(*res)) {
3260 /* no overflow check, because we know that the space is enough */
3261 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3262 }
3263 else if (PyUnicode_Check(*res)) {
3264 int repsize = PyUnicode_GET_SIZE(*res);
3265 if (repsize==1) {
3266 /* no overflow check, because we know that the space is enough */
3267 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3268 }
3269 else if (repsize!=0) {
3270 /* more than one character */
3271 int requiredsize = *outsize + repsize - 1;
3272 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3273 return -1;
3274 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3275 *outp += repsize;
3276 }
3277 }
3278 else
3279 return -1;
3280 return 0;
3281}
3282
/* Translate the size characters starting at p through mapping and return
   the result as a new Unicode object, or NULL on error.

   Each character is handed to charmaptranslate_output(): a character
   without an entry in mapping is copied unchanged, an int or unicode
   entry is written as its replacement, and a None entry marks the
   character as untranslatable.  Runs of untranslatable characters are
   processed according to errors:
     NULL or "strict"      raise a translate exception
     "replace"             write one '?' per character
     "ignore"              drop the characters
     "xmlcharrefreplace"   write "&#NNN;" per character
     anything else         call the registered error handler via
                           unicode_translate_call_errorhandler() */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     int size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    int respos = 0;
    /* number of characters currently allocated in res */
    int ressize;
    char *reason = "character maps to <undefined>";
    /* error handler callback and exception object; both are filled in
       lazily by the helper functions and released on every exit path */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    /* empty input: the freshly created object is already the result */
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);
    ressize = size;

    while (p<endp) {
        /* try to encode it */
        PyObject *x = NULL;
        if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is released here; below it is only compared by address
           against Py_None and never dereferenced */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            int repsize;
            int newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                /* stop at the first character that does not map to None */
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
                case 1: /* strict */
                    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                    goto onError;
                case 2: /* replace */
                    /* No need to check for space, this is a 1:1 replacement */
                    for (coll = collstart; coll<collend; ++coll)
                        *str++ = '?';
                    /* fall through */
                case 3: /* ignore */
                    /* skip the whole untranslatable run */
                    p = collend;
                    break;
                case 4: /* xmlcharrefreplace */
                    /* generate replacement (temporarily (mis)uses p) */
                    for (p = collstart; p < collend; ++p) {
                        char buffer[2+29+1+1];
                        char *cp;
                        sprintf(buffer, "&#%d;", (int)*p);
                        /* required size = what has been written so far
                           + this character reference
                           + the input still to come after the run */
                        if (charmaptranslate_makespace(&res, &str, &ressize,
                            (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                            goto onError;
                        for (cp = buffer; *cp; ++cp)
                            *str++ = *cp;
                    }
                    p = collend;
                    break;
                default:
                    /* unknown handler name: ask the callback for a
                       replacement string and the position to resume at */
                    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                        reason, startp, size, &exc,
                        collstart-startp, collend-startp, &newpos);
                    if (repunicode == NULL)
                        goto onError;
                    /* generate replacement */
                    repsize = PyUnicode_GET_SIZE(repunicode);
                    /* required size = what has been written so far
                       + the replacement
                       + the input still to come after the run */
                    if (charmaptranslate_makespace(&res, &str, &ressize,
                        (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                        *str++ = *uni2;
                    /* continue at the position chosen by the handler */
                    p = startp + newpos;
                    Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<ressize) {
        if (_PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

    onError:
    /* common cleanup: drop the partial result and the error state objects */
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
3426
3427PyObject *PyUnicode_Translate(PyObject *str,
3428 PyObject *mapping,
3429 const char *errors)
3430{
3431 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003432
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 str = PyUnicode_FromObject(str);
3434 if (str == NULL)
3435 goto onError;
3436 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3437 PyUnicode_GET_SIZE(str),
3438 mapping,
3439 errors);
3440 Py_DECREF(str);
3441 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003442
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443 onError:
3444 Py_XDECREF(str);
3445 return NULL;
3446}
Tim Petersced69f82003-09-16 20:30:58 +00003447
Guido van Rossum9e896b32000-04-05 20:11:21 +00003448/* --- Decimal Encoder ---------------------------------------------------- */
3449
3450int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3451 int length,
3452 char *output,
3453 const char *errors)
3454{
3455 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456 PyObject *errorHandler = NULL;
3457 PyObject *exc = NULL;
3458 const char *encoding = "decimal";
3459 const char *reason = "invalid decimal Unicode string";
3460 /* the following variable is used for caching string comparisons
3461 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3462 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003463
3464 if (output == NULL) {
3465 PyErr_BadArgument();
3466 return -1;
3467 }
3468
3469 p = s;
3470 end = s + length;
3471 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003473 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003474 PyObject *repunicode;
3475 int repsize;
3476 int newpos;
3477 Py_UNICODE *uni2;
3478 Py_UNICODE *collstart;
3479 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003480
Guido van Rossum9e896b32000-04-05 20:11:21 +00003481 if (Py_UNICODE_ISSPACE(ch)) {
3482 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003483 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003484 continue;
3485 }
3486 decimal = Py_UNICODE_TODECIMAL(ch);
3487 if (decimal >= 0) {
3488 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003490 continue;
3491 }
Guido van Rossumba477042000-04-06 18:18:10 +00003492 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003493 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003494 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003495 continue;
3496 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 /* All other characters are considered unencodable */
3498 collstart = p;
3499 collend = p+1;
3500 while (collend < end) {
3501 if ((0 < *collend && *collend < 256) ||
3502 !Py_UNICODE_ISSPACE(*collend) ||
3503 Py_UNICODE_TODECIMAL(*collend))
3504 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003505 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 /* cache callback name lookup
3507 * (if not done yet, i.e. it's the first error) */
3508 if (known_errorHandler==-1) {
3509 if ((errors==NULL) || (!strcmp(errors, "strict")))
3510 known_errorHandler = 1;
3511 else if (!strcmp(errors, "replace"))
3512 known_errorHandler = 2;
3513 else if (!strcmp(errors, "ignore"))
3514 known_errorHandler = 3;
3515 else if (!strcmp(errors, "xmlcharrefreplace"))
3516 known_errorHandler = 4;
3517 else
3518 known_errorHandler = 0;
3519 }
3520 switch (known_errorHandler) {
3521 case 1: /* strict */
3522 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3523 goto onError;
3524 case 2: /* replace */
3525 for (p = collstart; p < collend; ++p)
3526 *output++ = '?';
3527 /* fall through */
3528 case 3: /* ignore */
3529 p = collend;
3530 break;
3531 case 4: /* xmlcharrefreplace */
3532 /* generate replacement (temporarily (mis)uses p) */
3533 for (p = collstart; p < collend; ++p)
3534 output += sprintf(output, "&#%d;", (int)*p);
3535 p = collend;
3536 break;
3537 default:
3538 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3539 encoding, reason, s, length, &exc,
3540 collstart-s, collend-s, &newpos);
3541 if (repunicode == NULL)
3542 goto onError;
3543 /* generate replacement */
3544 repsize = PyUnicode_GET_SIZE(repunicode);
3545 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3546 Py_UNICODE ch = *uni2;
3547 if (Py_UNICODE_ISSPACE(ch))
3548 *output++ = ' ';
3549 else {
3550 decimal = Py_UNICODE_TODECIMAL(ch);
3551 if (decimal >= 0)
3552 *output++ = '0' + decimal;
3553 else if (0 < ch && ch < 256)
3554 *output++ = (char)ch;
3555 else {
3556 Py_DECREF(repunicode);
3557 raise_encode_exception(&exc, encoding,
3558 s, length, collstart-s, collend-s, reason);
3559 goto onError;
3560 }
3561 }
3562 }
3563 p = s + newpos;
3564 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003565 }
3566 }
3567 /* 0-terminate the output string */
3568 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 Py_XDECREF(exc);
3570 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003571 return 0;
3572
3573 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 Py_XDECREF(exc);
3575 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003576 return -1;
3577}
3578
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579/* --- Helpers ------------------------------------------------------------ */
3580
Tim Petersced69f82003-09-16 20:30:58 +00003581static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582int count(PyUnicodeObject *self,
3583 int start,
3584 int end,
3585 PyUnicodeObject *substring)
3586{
3587 int count = 0;
3588
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003589 if (start < 0)
3590 start += self->length;
3591 if (start < 0)
3592 start = 0;
3593 if (end > self->length)
3594 end = self->length;
3595 if (end < 0)
3596 end += self->length;
3597 if (end < 0)
3598 end = 0;
3599
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003600 if (substring->length == 0)
3601 return (end - start + 1);
3602
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603 end -= substring->length;
3604
3605 while (start <= end)
3606 if (Py_UNICODE_MATCH(self, start, substring)) {
3607 count++;
3608 start += substring->length;
3609 } else
3610 start++;
3611
3612 return count;
3613}
3614
3615int PyUnicode_Count(PyObject *str,
3616 PyObject *substr,
3617 int start,
3618 int end)
3619{
3620 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003621
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 str = PyUnicode_FromObject(str);
3623 if (str == NULL)
3624 return -1;
3625 substr = PyUnicode_FromObject(substr);
3626 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003627 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 return -1;
3629 }
Tim Petersced69f82003-09-16 20:30:58 +00003630
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631 result = count((PyUnicodeObject *)str,
3632 start, end,
3633 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003634
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635 Py_DECREF(str);
3636 Py_DECREF(substr);
3637 return result;
3638}
3639
Tim Petersced69f82003-09-16 20:30:58 +00003640static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641int findstring(PyUnicodeObject *self,
3642 PyUnicodeObject *substring,
3643 int start,
3644 int end,
3645 int direction)
3646{
3647 if (start < 0)
3648 start += self->length;
3649 if (start < 0)
3650 start = 0;
3651
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 if (end > self->length)
3653 end = self->length;
3654 if (end < 0)
3655 end += self->length;
3656 if (end < 0)
3657 end = 0;
3658
Guido van Rossum76afbd92002-08-20 17:29:29 +00003659 if (substring->length == 0)
3660 return (direction > 0) ? start : end;
3661
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 end -= substring->length;
3663
3664 if (direction < 0) {
3665 for (; end >= start; end--)
3666 if (Py_UNICODE_MATCH(self, end, substring))
3667 return end;
3668 } else {
3669 for (; start <= end; start++)
3670 if (Py_UNICODE_MATCH(self, start, substring))
3671 return start;
3672 }
3673
3674 return -1;
3675}
3676
3677int PyUnicode_Find(PyObject *str,
3678 PyObject *substr,
3679 int start,
3680 int end,
3681 int direction)
3682{
3683 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003684
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 str = PyUnicode_FromObject(str);
3686 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003687 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 substr = PyUnicode_FromObject(substr);
3689 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003690 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003691 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 }
Tim Petersced69f82003-09-16 20:30:58 +00003693
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 result = findstring((PyUnicodeObject *)str,
3695 (PyUnicodeObject *)substr,
3696 start, end, direction);
3697 Py_DECREF(str);
3698 Py_DECREF(substr);
3699 return result;
3700}
3701
Tim Petersced69f82003-09-16 20:30:58 +00003702static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703int tailmatch(PyUnicodeObject *self,
3704 PyUnicodeObject *substring,
3705 int start,
3706 int end,
3707 int direction)
3708{
3709 if (start < 0)
3710 start += self->length;
3711 if (start < 0)
3712 start = 0;
3713
3714 if (substring->length == 0)
3715 return 1;
3716
3717 if (end > self->length)
3718 end = self->length;
3719 if (end < 0)
3720 end += self->length;
3721 if (end < 0)
3722 end = 0;
3723
3724 end -= substring->length;
3725 if (end < start)
3726 return 0;
3727
3728 if (direction > 0) {
3729 if (Py_UNICODE_MATCH(self, end, substring))
3730 return 1;
3731 } else {
3732 if (Py_UNICODE_MATCH(self, start, substring))
3733 return 1;
3734 }
3735
3736 return 0;
3737}
3738
3739int PyUnicode_Tailmatch(PyObject *str,
3740 PyObject *substr,
3741 int start,
3742 int end,
3743 int direction)
3744{
3745 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003746
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 str = PyUnicode_FromObject(str);
3748 if (str == NULL)
3749 return -1;
3750 substr = PyUnicode_FromObject(substr);
3751 if (substr == NULL) {
3752 Py_DECREF(substr);
3753 return -1;
3754 }
Tim Petersced69f82003-09-16 20:30:58 +00003755
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756 result = tailmatch((PyUnicodeObject *)str,
3757 (PyUnicodeObject *)substr,
3758 start, end, direction);
3759 Py_DECREF(str);
3760 Py_DECREF(substr);
3761 return result;
3762}
3763
Tim Petersced69f82003-09-16 20:30:58 +00003764static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765const Py_UNICODE *findchar(const Py_UNICODE *s,
3766 int size,
3767 Py_UNICODE ch)
3768{
3769 /* like wcschr, but doesn't stop at NULL characters */
3770
3771 while (size-- > 0) {
3772 if (*s == ch)
3773 return s;
3774 s++;
3775 }
3776
3777 return NULL;
3778}
3779
3780/* Apply fixfct filter to the Unicode object self and return a
3781 reference to the modified object */
3782
Tim Petersced69f82003-09-16 20:30:58 +00003783static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784PyObject *fixup(PyUnicodeObject *self,
3785 int (*fixfct)(PyUnicodeObject *s))
3786{
3787
3788 PyUnicodeObject *u;
3789
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003790 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791 if (u == NULL)
3792 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003793
3794 Py_UNICODE_COPY(u->str, self->str, self->length);
3795
Tim Peters7a29bd52001-09-12 03:03:31 +00003796 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 /* fixfct should return TRUE if it modified the buffer. If
3798 FALSE, return a reference to the original buffer instead
3799 (to save space, not time) */
3800 Py_INCREF(self);
3801 Py_DECREF(u);
3802 return (PyObject*) self;
3803 }
3804 return (PyObject*) u;
3805}
3806
Tim Petersced69f82003-09-16 20:30:58 +00003807static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808int fixupper(PyUnicodeObject *self)
3809{
3810 int len = self->length;
3811 Py_UNICODE *s = self->str;
3812 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003813
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 while (len-- > 0) {
3815 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003816
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 ch = Py_UNICODE_TOUPPER(*s);
3818 if (ch != *s) {
3819 status = 1;
3820 *s = ch;
3821 }
3822 s++;
3823 }
3824
3825 return status;
3826}
3827
Tim Petersced69f82003-09-16 20:30:58 +00003828static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829int fixlower(PyUnicodeObject *self)
3830{
3831 int len = self->length;
3832 Py_UNICODE *s = self->str;
3833 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003834
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 while (len-- > 0) {
3836 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003837
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 ch = Py_UNICODE_TOLOWER(*s);
3839 if (ch != *s) {
3840 status = 1;
3841 *s = ch;
3842 }
3843 s++;
3844 }
3845
3846 return status;
3847}
3848
Tim Petersced69f82003-09-16 20:30:58 +00003849static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850int fixswapcase(PyUnicodeObject *self)
3851{
3852 int len = self->length;
3853 Py_UNICODE *s = self->str;
3854 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003855
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 while (len-- > 0) {
3857 if (Py_UNICODE_ISUPPER(*s)) {
3858 *s = Py_UNICODE_TOLOWER(*s);
3859 status = 1;
3860 } else if (Py_UNICODE_ISLOWER(*s)) {
3861 *s = Py_UNICODE_TOUPPER(*s);
3862 status = 1;
3863 }
3864 s++;
3865 }
3866
3867 return status;
3868}
3869
Tim Petersced69f82003-09-16 20:30:58 +00003870static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871int fixcapitalize(PyUnicodeObject *self)
3872{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003873 int len = self->length;
3874 Py_UNICODE *s = self->str;
3875 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003876
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003877 if (len == 0)
3878 return 0;
3879 if (Py_UNICODE_ISLOWER(*s)) {
3880 *s = Py_UNICODE_TOUPPER(*s);
3881 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003882 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003883 s++;
3884 while (--len > 0) {
3885 if (Py_UNICODE_ISUPPER(*s)) {
3886 *s = Py_UNICODE_TOLOWER(*s);
3887 status = 1;
3888 }
3889 s++;
3890 }
3891 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892}
3893
3894static
3895int fixtitle(PyUnicodeObject *self)
3896{
3897 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3898 register Py_UNICODE *e;
3899 int previous_is_cased;
3900
3901 /* Shortcut for single character strings */
3902 if (PyUnicode_GET_SIZE(self) == 1) {
3903 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3904 if (*p != ch) {
3905 *p = ch;
3906 return 1;
3907 }
3908 else
3909 return 0;
3910 }
Tim Petersced69f82003-09-16 20:30:58 +00003911
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912 e = p + PyUnicode_GET_SIZE(self);
3913 previous_is_cased = 0;
3914 for (; p < e; p++) {
3915 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003916
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 if (previous_is_cased)
3918 *p = Py_UNICODE_TOLOWER(ch);
3919 else
3920 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003921
3922 if (Py_UNICODE_ISLOWER(ch) ||
3923 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924 Py_UNICODE_ISTITLE(ch))
3925 previous_is_cased = 1;
3926 else
3927 previous_is_cased = 0;
3928 }
3929 return 1;
3930}
3931
3932PyObject *PyUnicode_Join(PyObject *separator,
3933 PyObject *seq)
3934{
3935 Py_UNICODE *sep;
3936 int seplen;
3937 PyUnicodeObject *res = NULL;
3938 int reslen = 0;
3939 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 int sz = 100;
3941 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003942 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943
Tim Peters2cfe3682001-05-05 05:36:48 +00003944 it = PyObject_GetIter(seq);
3945 if (it == NULL)
3946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947
3948 if (separator == NULL) {
3949 Py_UNICODE blank = ' ';
3950 sep = &blank;
3951 seplen = 1;
3952 }
3953 else {
3954 separator = PyUnicode_FromObject(separator);
3955 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003956 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 sep = PyUnicode_AS_UNICODE(separator);
3958 seplen = PyUnicode_GET_SIZE(separator);
3959 }
Tim Petersced69f82003-09-16 20:30:58 +00003960
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 res = _PyUnicode_New(sz);
3962 if (res == NULL)
3963 goto onError;
3964 p = PyUnicode_AS_UNICODE(res);
3965 reslen = 0;
3966
Tim Peters2cfe3682001-05-05 05:36:48 +00003967 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003969 PyObject *item = PyIter_Next(it);
3970 if (item == NULL) {
3971 if (PyErr_Occurred())
3972 goto onError;
3973 break;
3974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 if (!PyUnicode_Check(item)) {
3976 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003977 if (!PyString_Check(item)) {
3978 PyErr_Format(PyExc_TypeError,
3979 "sequence item %i: expected string or Unicode,"
3980 " %.80s found",
3981 i, item->ob_type->tp_name);
3982 Py_DECREF(item);
3983 goto onError;
3984 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 v = PyUnicode_FromObject(item);
3986 Py_DECREF(item);
3987 item = v;
3988 if (item == NULL)
3989 goto onError;
3990 }
3991 itemlen = PyUnicode_GET_SIZE(item);
3992 while (reslen + itemlen + seplen >= sz) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003993 if (_PyUnicode_Resize(&res, sz*2) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003994 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 sz *= 2;
3998 p = PyUnicode_AS_UNICODE(res) + reslen;
3999 }
4000 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004001 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002 p += seplen;
4003 reslen += seplen;
4004 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004005 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 p += itemlen;
4007 reslen += itemlen;
4008 Py_DECREF(item);
4009 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004010 if (_PyUnicode_Resize(&res, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 goto onError;
4012
4013 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004014 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 return (PyObject *)res;
4016
4017 onError:
4018 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004019 Py_XDECREF(res);
4020 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 return NULL;
4022}
4023
Tim Petersced69f82003-09-16 20:30:58 +00004024static
4025PyUnicodeObject *pad(PyUnicodeObject *self,
4026 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 int right,
4028 Py_UNICODE fill)
4029{
4030 PyUnicodeObject *u;
4031
4032 if (left < 0)
4033 left = 0;
4034 if (right < 0)
4035 right = 0;
4036
Tim Peters7a29bd52001-09-12 03:03:31 +00004037 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 Py_INCREF(self);
4039 return self;
4040 }
4041
4042 u = _PyUnicode_New(left + self->length + right);
4043 if (u) {
4044 if (left)
4045 Py_UNICODE_FILL(u->str, fill, left);
4046 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4047 if (right)
4048 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4049 }
4050
4051 return u;
4052}
4053
/* Append the slice data[left:right] to list as a new Unicode object.

   This macro relies on its expansion context: the enclosing function
   has to provide a local variable "str" (used as scratch space for the
   new object), a variable "list" and a label "onError", which is
   jumped to if creating the object or appending it fails.  The
   temporary reference held in str is released in either case.

   Note that the expansion is a sequence of statements that is not
   wrapped in do { ... } while (0) and that the arguments are not
   parenthesized, so the macro must only be used as a statement of its
   own inside a braced block, with simple expressions as arguments. */
#define SPLIT_APPEND(data, left, right)                                 \
        str = PyUnicode_FromUnicode(data + left, right - left);         \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);
4064
4065static
4066PyObject *split_whitespace(PyUnicodeObject *self,
4067 PyObject *list,
4068 int maxcount)
4069{
4070 register int i;
4071 register int j;
4072 int len = self->length;
4073 PyObject *str;
4074
4075 for (i = j = 0; i < len; ) {
4076 /* find a token */
4077 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4078 i++;
4079 j = i;
4080 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4081 i++;
4082 if (j < i) {
4083 if (maxcount-- <= 0)
4084 break;
4085 SPLIT_APPEND(self->str, j, i);
4086 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4087 i++;
4088 j = i;
4089 }
4090 }
4091 if (j < len) {
4092 SPLIT_APPEND(self->str, j, len);
4093 }
4094 return list;
4095
4096 onError:
4097 Py_DECREF(list);
4098 return NULL;
4099}
4100
4101PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004102 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103{
4104 register int i;
4105 register int j;
4106 int len;
4107 PyObject *list;
4108 PyObject *str;
4109 Py_UNICODE *data;
4110
4111 string = PyUnicode_FromObject(string);
4112 if (string == NULL)
4113 return NULL;
4114 data = PyUnicode_AS_UNICODE(string);
4115 len = PyUnicode_GET_SIZE(string);
4116
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 list = PyList_New(0);
4118 if (!list)
4119 goto onError;
4120
4121 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004122 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004123
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 /* Find a line and append it */
4125 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4126 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127
4128 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004129 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 if (i < len) {
4131 if (data[i] == '\r' && i + 1 < len &&
4132 data[i+1] == '\n')
4133 i += 2;
4134 else
4135 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004136 if (keepends)
4137 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 }
Guido van Rossum86662912000-04-11 15:38:46 +00004139 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 j = i;
4141 }
4142 if (j < len) {
4143 SPLIT_APPEND(data, j, len);
4144 }
4145
4146 Py_DECREF(string);
4147 return list;
4148
4149 onError:
4150 Py_DECREF(list);
4151 Py_DECREF(string);
4152 return NULL;
4153}
4154
Tim Petersced69f82003-09-16 20:30:58 +00004155static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156PyObject *split_char(PyUnicodeObject *self,
4157 PyObject *list,
4158 Py_UNICODE ch,
4159 int maxcount)
4160{
4161 register int i;
4162 register int j;
4163 int len = self->length;
4164 PyObject *str;
4165
4166 for (i = j = 0; i < len; ) {
4167 if (self->str[i] == ch) {
4168 if (maxcount-- <= 0)
4169 break;
4170 SPLIT_APPEND(self->str, j, i);
4171 i = j = i + 1;
4172 } else
4173 i++;
4174 }
4175 if (j <= len) {
4176 SPLIT_APPEND(self->str, j, len);
4177 }
4178 return list;
4179
4180 onError:
4181 Py_DECREF(list);
4182 return NULL;
4183}
4184
Tim Petersced69f82003-09-16 20:30:58 +00004185static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186PyObject *split_substring(PyUnicodeObject *self,
4187 PyObject *list,
4188 PyUnicodeObject *substring,
4189 int maxcount)
4190{
4191 register int i;
4192 register int j;
4193 int len = self->length;
4194 int sublen = substring->length;
4195 PyObject *str;
4196
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004197 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198 if (Py_UNICODE_MATCH(self, i, substring)) {
4199 if (maxcount-- <= 0)
4200 break;
4201 SPLIT_APPEND(self->str, j, i);
4202 i = j = i + sublen;
4203 } else
4204 i++;
4205 }
4206 if (j <= len) {
4207 SPLIT_APPEND(self->str, j, len);
4208 }
4209 return list;
4210
4211 onError:
4212 Py_DECREF(list);
4213 return NULL;
4214}
4215
4216#undef SPLIT_APPEND
4217
4218static
4219PyObject *split(PyUnicodeObject *self,
4220 PyUnicodeObject *substring,
4221 int maxcount)
4222{
4223 PyObject *list;
4224
4225 if (maxcount < 0)
4226 maxcount = INT_MAX;
4227
4228 list = PyList_New(0);
4229 if (!list)
4230 return NULL;
4231
4232 if (substring == NULL)
4233 return split_whitespace(self,list,maxcount);
4234
4235 else if (substring->length == 1)
4236 return split_char(self,list,substring->str[0],maxcount);
4237
4238 else if (substring->length == 0) {
4239 Py_DECREF(list);
4240 PyErr_SetString(PyExc_ValueError, "empty separator");
4241 return NULL;
4242 }
4243 else
4244 return split_substring(self,list,substring,maxcount);
4245}
4246
Tim Petersced69f82003-09-16 20:30:58 +00004247static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004248PyObject *replace(PyUnicodeObject *self,
4249 PyUnicodeObject *str1,
4250 PyUnicodeObject *str2,
4251 int maxcount)
4252{
4253 PyUnicodeObject *u;
4254
4255 if (maxcount < 0)
4256 maxcount = INT_MAX;
4257
4258 if (str1->length == 1 && str2->length == 1) {
4259 int i;
4260
4261 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004262 if (!findchar(self->str, self->length, str1->str[0]) &&
4263 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004264 /* nothing to replace, return original string */
4265 Py_INCREF(self);
4266 u = self;
4267 } else {
4268 Py_UNICODE u1 = str1->str[0];
4269 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004270
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004272 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273 self->length
4274 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004275 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004276 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004277 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278 for (i = 0; i < u->length; i++)
4279 if (u->str[i] == u1) {
4280 if (--maxcount < 0)
4281 break;
4282 u->str[i] = u2;
4283 }
4284 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286
4287 } else {
4288 int n, i;
4289 Py_UNICODE *p;
4290
4291 /* replace strings */
4292 n = count(self, 0, self->length, str1);
4293 if (n > maxcount)
4294 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004295 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004296 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004297 if (PyUnicode_CheckExact(self)) {
4298 Py_INCREF(self);
4299 u = self;
4300 }
4301 else {
4302 u = (PyUnicodeObject *)
4303 PyUnicode_FromUnicode(self->str, self->length);
4304 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 } else {
4306 u = _PyUnicode_New(
4307 self->length + n * (str2->length - str1->length));
4308 if (u) {
4309 i = 0;
4310 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004311 if (str1->length > 0) {
4312 while (i <= self->length - str1->length)
4313 if (Py_UNICODE_MATCH(self, i, str1)) {
4314 /* replace string segment */
4315 Py_UNICODE_COPY(p, str2->str, str2->length);
4316 p += str2->length;
4317 i += str1->length;
4318 if (--n <= 0) {
4319 /* copy remaining part */
4320 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4321 break;
4322 }
4323 } else
4324 *p++ = self->str[i++];
4325 } else {
4326 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327 Py_UNICODE_COPY(p, str2->str, str2->length);
4328 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004329 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004332 }
4333 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4334 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335 }
4336 }
4337 }
Tim Petersced69f82003-09-16 20:30:58 +00004338
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339 return (PyObject *) u;
4340}
4341
4342/* --- Unicode Object Methods --------------------------------------------- */
4343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004344PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345"S.title() -> unicode\n\
4346\n\
4347Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004348characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349
4350static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004351unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353 return fixup(self, fixtitle);
4354}
4355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004356PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357"S.capitalize() -> unicode\n\
4358\n\
4359Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004360have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361
4362static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004363unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 return fixup(self, fixcapitalize);
4366}
4367
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code.  Splits self with split(self, NULL, -1), replaces each
   list item by fixup(item, fixcapitalize) and joins the list again with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

onError:
    /* Reached on success too; result is NULL on the failure paths. */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004406PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407"S.center(width) -> unicode\n\
4408\n\
4409Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004410using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411
4412static PyObject *
4413unicode_center(PyUnicodeObject *self, PyObject *args)
4414{
4415 int marg, left;
4416 int width;
4417
4418 if (!PyArg_ParseTuple(args, "i:center", &width))
4419 return NULL;
4420
Tim Peters7a29bd52001-09-12 03:03:31 +00004421 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 Py_INCREF(self);
4423 return (PyObject*) self;
4424 }
4425
4426 marg = width - self->length;
4427 left = marg / 2 + (marg & width & 1);
4428
4429 return (PyObject*) pad(self, left, marg - left, ' ');
4430}
4431
Marc-André Lemburge5034372000-08-08 08:04:29 +00004432#if 0
4433
4434/* This code should go into some future Unicode collation support
4435 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004436 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004437
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004438/* speedy UTF-16 code point order comparison */
4439/* gleaned from: */
4440/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4441
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004442static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004443{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004444 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004445 0, 0, 0, 0, 0, 0, 0, 0,
4446 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004447 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004448};
4449
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450static int
4451unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4452{
4453 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004454
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 Py_UNICODE *s1 = str1->str;
4456 Py_UNICODE *s2 = str2->str;
4457
4458 len1 = str1->length;
4459 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004460
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004462 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004463
4464 c1 = *s1++;
4465 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004466
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004467 if (c1 > (1<<11) * 26)
4468 c1 += utf16Fixup[c1>>11];
4469 if (c2 > (1<<11) * 26)
4470 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004471 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004472
4473 if (c1 != c2)
4474 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004475
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004476 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 }
4478
4479 return (len1 < len2) ? -1 : (len1 != len2);
4480}
4481
Marc-André Lemburge5034372000-08-08 08:04:29 +00004482#else
4483
4484static int
4485unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4486{
4487 register int len1, len2;
4488
4489 Py_UNICODE *s1 = str1->str;
4490 Py_UNICODE *s2 = str2->str;
4491
4492 len1 = str1->length;
4493 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004494
Marc-André Lemburge5034372000-08-08 08:04:29 +00004495 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004496 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004497
Fredrik Lundh45714e92001-06-26 16:39:36 +00004498 c1 = *s1++;
4499 c2 = *s2++;
4500
4501 if (c1 != c2)
4502 return (c1 < c2) ? -1 : 1;
4503
Marc-André Lemburge5034372000-08-08 08:04:29 +00004504 len1--; len2--;
4505 }
4506
4507 return (len1 < len2) ? -1 : (len1 != len2);
4508}
4509
4510#endif
4511
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512int PyUnicode_Compare(PyObject *left,
4513 PyObject *right)
4514{
4515 PyUnicodeObject *u = NULL, *v = NULL;
4516 int result;
4517
4518 /* Coerce the two arguments */
4519 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4520 if (u == NULL)
4521 goto onError;
4522 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4523 if (v == NULL)
4524 goto onError;
4525
Thomas Wouters7e474022000-07-16 12:04:32 +00004526 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527 if (v == u) {
4528 Py_DECREF(u);
4529 Py_DECREF(v);
4530 return 0;
4531 }
4532
4533 result = unicode_compare(u, v);
4534
4535 Py_DECREF(u);
4536 Py_DECREF(v);
4537 return result;
4538
4539onError:
4540 Py_XDECREF(u);
4541 Py_XDECREF(v);
4542 return -1;
4543}
4544
Guido van Rossum403d68b2000-03-13 15:55:09 +00004545int PyUnicode_Contains(PyObject *container,
4546 PyObject *element)
4547{
4548 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004549 int result, size;
4550 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004551
4552 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004553 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004554 if (v == NULL) {
4555 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004556 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004557 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004558 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004559 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004560 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004561 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004562
Barry Warsaw817918c2002-08-06 16:58:21 +00004563 size = PyUnicode_GET_SIZE(v);
4564 rhs = PyUnicode_AS_UNICODE(v);
4565 lhs = PyUnicode_AS_UNICODE(u);
4566
Guido van Rossum403d68b2000-03-13 15:55:09 +00004567 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004568 if (size == 1) {
4569 end = lhs + PyUnicode_GET_SIZE(u);
4570 while (lhs < end) {
4571 if (*lhs++ == *rhs) {
4572 result = 1;
4573 break;
4574 }
4575 }
4576 }
4577 else {
4578 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4579 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004580 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004581 result = 1;
4582 break;
4583 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004584 }
4585 }
4586
4587 Py_DECREF(u);
4588 Py_DECREF(v);
4589 return result;
4590
4591onError:
4592 Py_XDECREF(u);
4593 Py_XDECREF(v);
4594 return -1;
4595}
4596
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597/* Concat to string or Unicode object giving a new Unicode object. */
4598
4599PyObject *PyUnicode_Concat(PyObject *left,
4600 PyObject *right)
4601{
4602 PyUnicodeObject *u = NULL, *v = NULL, *w;
4603
4604 /* Coerce the two arguments */
4605 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4606 if (u == NULL)
4607 goto onError;
4608 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4609 if (v == NULL)
4610 goto onError;
4611
4612 /* Shortcuts */
4613 if (v == unicode_empty) {
4614 Py_DECREF(v);
4615 return (PyObject *)u;
4616 }
4617 if (u == unicode_empty) {
4618 Py_DECREF(u);
4619 return (PyObject *)v;
4620 }
4621
4622 /* Concat the two Unicode strings */
4623 w = _PyUnicode_New(u->length + v->length);
4624 if (w == NULL)
4625 goto onError;
4626 Py_UNICODE_COPY(w->str, u->str, u->length);
4627 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4628
4629 Py_DECREF(u);
4630 Py_DECREF(v);
4631 return (PyObject *)w;
4632
4633onError:
4634 Py_XDECREF(u);
4635 Py_XDECREF(v);
4636 return NULL;
4637}
4638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004639PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640"S.count(sub[, start[, end]]) -> int\n\
4641\n\
4642Return the number of occurrences of substring sub in Unicode string\n\
4643S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004644interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645
4646static PyObject *
4647unicode_count(PyUnicodeObject *self, PyObject *args)
4648{
4649 PyUnicodeObject *substring;
4650 int start = 0;
4651 int end = INT_MAX;
4652 PyObject *result;
4653
Guido van Rossumb8872e62000-05-09 14:14:27 +00004654 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4655 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656 return NULL;
4657
4658 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4659 (PyObject *)substring);
4660 if (substring == NULL)
4661 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004662
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 if (start < 0)
4664 start += self->length;
4665 if (start < 0)
4666 start = 0;
4667 if (end > self->length)
4668 end = self->length;
4669 if (end < 0)
4670 end += self->length;
4671 if (end < 0)
4672 end = 0;
4673
4674 result = PyInt_FromLong((long) count(self, start, end, substring));
4675
4676 Py_DECREF(substring);
4677 return result;
4678}
4679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004680PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681"S.encode([encoding[,errors]]) -> string\n\
4682\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004683Return an encoded string version of S. Default encoding is the current\n\
4684default string encoding. errors may be given to set a different error\n\
4685handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4687'xmlcharrefreplace' as well as any other name registered with\n\
4688codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689
4690static PyObject *
4691unicode_encode(PyUnicodeObject *self, PyObject *args)
4692{
4693 char *encoding = NULL;
4694 char *errors = NULL;
4695 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4696 return NULL;
4697 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4698}
4699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004700PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701"S.expandtabs([tabsize]) -> unicode\n\
4702\n\
4703Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004704If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705
4706static PyObject*
4707unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4708{
4709 Py_UNICODE *e;
4710 Py_UNICODE *p;
4711 Py_UNICODE *q;
4712 int i, j;
4713 PyUnicodeObject *u;
4714 int tabsize = 8;
4715
4716 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4717 return NULL;
4718
Thomas Wouters7e474022000-07-16 12:04:32 +00004719 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720 i = j = 0;
4721 e = self->str + self->length;
4722 for (p = self->str; p < e; p++)
4723 if (*p == '\t') {
4724 if (tabsize > 0)
4725 j += tabsize - (j % tabsize);
4726 }
4727 else {
4728 j++;
4729 if (*p == '\n' || *p == '\r') {
4730 i += j;
4731 j = 0;
4732 }
4733 }
4734
4735 /* Second pass: create output string and fill it */
4736 u = _PyUnicode_New(i + j);
4737 if (!u)
4738 return NULL;
4739
4740 j = 0;
4741 q = u->str;
4742
4743 for (p = self->str; p < e; p++)
4744 if (*p == '\t') {
4745 if (tabsize > 0) {
4746 i = tabsize - (j % tabsize);
4747 j += i;
4748 while (i--)
4749 *q++ = ' ';
4750 }
4751 }
4752 else {
4753 j++;
4754 *q++ = *p;
4755 if (*p == '\n' || *p == '\r')
4756 j = 0;
4757 }
4758
4759 return (PyObject*) u;
4760}
4761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004762PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763"S.find(sub [,start [,end]]) -> int\n\
4764\n\
4765Return the lowest index in S where substring sub is found,\n\
4766such that sub is contained within s[start,end]. Optional\n\
4767arguments start and end are interpreted as in slice notation.\n\
4768\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004769Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770
4771static PyObject *
4772unicode_find(PyUnicodeObject *self, PyObject *args)
4773{
4774 PyUnicodeObject *substring;
4775 int start = 0;
4776 int end = INT_MAX;
4777 PyObject *result;
4778
Guido van Rossumb8872e62000-05-09 14:14:27 +00004779 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4780 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781 return NULL;
4782 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4783 (PyObject *)substring);
4784 if (substring == NULL)
4785 return NULL;
4786
4787 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4788
4789 Py_DECREF(substring);
4790 return result;
4791}
4792
4793static PyObject *
4794unicode_getitem(PyUnicodeObject *self, int index)
4795{
4796 if (index < 0 || index >= self->length) {
4797 PyErr_SetString(PyExc_IndexError, "string index out of range");
4798 return NULL;
4799 }
4800
4801 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4802}
4803
4804static long
4805unicode_hash(PyUnicodeObject *self)
4806{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004807 /* Since Unicode objects compare equal to their ASCII string
4808 counterparts, they should use the individual character values
4809 as basis for their hash value. This is needed to assure that
4810 strings and Unicode objects behave in the same way as
4811 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812
Fredrik Lundhdde61642000-07-10 18:27:47 +00004813 register int len;
4814 register Py_UNICODE *p;
4815 register long x;
4816
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 if (self->hash != -1)
4818 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004819 len = PyUnicode_GET_SIZE(self);
4820 p = PyUnicode_AS_UNICODE(self);
4821 x = *p << 7;
4822 while (--len >= 0)
4823 x = (1000003*x) ^ *p++;
4824 x ^= PyUnicode_GET_SIZE(self);
4825 if (x == -1)
4826 x = -2;
4827 self->hash = x;
4828 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829}
4830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004831PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832"S.index(sub [,start [,end]]) -> int\n\
4833\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004834Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835
4836static PyObject *
4837unicode_index(PyUnicodeObject *self, PyObject *args)
4838{
4839 int result;
4840 PyUnicodeObject *substring;
4841 int start = 0;
4842 int end = INT_MAX;
4843
Guido van Rossumb8872e62000-05-09 14:14:27 +00004844 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4845 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004847
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4849 (PyObject *)substring);
4850 if (substring == NULL)
4851 return NULL;
4852
4853 result = findstring(self, substring, start, end, 1);
4854
4855 Py_DECREF(substring);
4856 if (result < 0) {
4857 PyErr_SetString(PyExc_ValueError, "substring not found");
4858 return NULL;
4859 }
4860 return PyInt_FromLong(result);
4861}
4862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004863PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004864"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004866Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004867at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868
4869static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004870unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871{
4872 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4873 register const Py_UNICODE *e;
4874 int cased;
4875
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 /* Shortcut for single character strings */
4877 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004878 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004880 /* Special case for empty strings */
4881 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004882 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004883
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 e = p + PyUnicode_GET_SIZE(self);
4885 cased = 0;
4886 for (; p < e; p++) {
4887 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004888
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004890 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 else if (!cased && Py_UNICODE_ISLOWER(ch))
4892 cased = 1;
4893 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004894 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895}
4896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004897PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004898"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004900Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004901at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902
4903static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004904unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905{
4906 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4907 register const Py_UNICODE *e;
4908 int cased;
4909
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 /* Shortcut for single character strings */
4911 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004912 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004914 /* Special case for empty strings */
4915 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004916 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004917
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 e = p + PyUnicode_GET_SIZE(self);
4919 cased = 0;
4920 for (; p < e; p++) {
4921 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004922
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004924 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 else if (!cased && Py_UNICODE_ISUPPER(ch))
4926 cased = 1;
4927 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004928 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929}
4930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004931PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004932"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004934Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4935characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004936only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937
4938static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004939unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940{
4941 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4942 register const Py_UNICODE *e;
4943 int cased, previous_is_cased;
4944
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 /* Shortcut for single character strings */
4946 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004947 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4948 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004950 /* Special case for empty strings */
4951 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004952 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004953
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954 e = p + PyUnicode_GET_SIZE(self);
4955 cased = 0;
4956 previous_is_cased = 0;
4957 for (; p < e; p++) {
4958 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004959
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4961 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004962 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963 previous_is_cased = 1;
4964 cased = 1;
4965 }
4966 else if (Py_UNICODE_ISLOWER(ch)) {
4967 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004968 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969 previous_is_cased = 1;
4970 cased = 1;
4971 }
4972 else
4973 previous_is_cased = 0;
4974 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004975 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004976}
4977
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004978PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004979"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004980\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004981Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004982False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983
4984static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004985unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986{
4987 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4988 register const Py_UNICODE *e;
4989
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 /* Shortcut for single character strings */
4991 if (PyUnicode_GET_SIZE(self) == 1 &&
4992 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004993 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004995 /* Special case for empty strings */
4996 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004997 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004998
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999 e = p + PyUnicode_GET_SIZE(self);
5000 for (; p < e; p++) {
5001 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005002 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005004 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005}
5006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005007PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005008"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005009\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005010Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005011and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005012
5013static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005014unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005015{
5016 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5017 register const Py_UNICODE *e;
5018
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005019 /* Shortcut for single character strings */
5020 if (PyUnicode_GET_SIZE(self) == 1 &&
5021 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005022 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005023
5024 /* Special case for empty strings */
5025 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005026 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005027
5028 e = p + PyUnicode_GET_SIZE(self);
5029 for (; p < e; p++) {
5030 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005031 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005032 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005033 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005034}
5035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005036PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005037"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005038\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005039Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005040and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005041
5042static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005043unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005044{
5045 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5046 register const Py_UNICODE *e;
5047
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005048 /* Shortcut for single character strings */
5049 if (PyUnicode_GET_SIZE(self) == 1 &&
5050 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005051 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005052
5053 /* Special case for empty strings */
5054 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005055 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005056
5057 e = p + PyUnicode_GET_SIZE(self);
5058 for (; p < e; p++) {
5059 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005060 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005061 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005062 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005063}
5064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005065PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005066"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005068Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005069False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070
5071static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005072unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073{
5074 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5075 register const Py_UNICODE *e;
5076
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077 /* Shortcut for single character strings */
5078 if (PyUnicode_GET_SIZE(self) == 1 &&
5079 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005080 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005082 /* Special case for empty strings */
5083 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005084 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005085
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086 e = p + PyUnicode_GET_SIZE(self);
5087 for (; p < e; p++) {
5088 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005089 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005091 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092}
5093
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005094PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005095"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005097Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005098False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099
5100static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005101unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102{
5103 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5104 register const Py_UNICODE *e;
5105
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 /* Shortcut for single character strings */
5107 if (PyUnicode_GET_SIZE(self) == 1 &&
5108 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005109 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005111 /* Special case for empty strings */
5112 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005113 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005114
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115 e = p + PyUnicode_GET_SIZE(self);
5116 for (; p < e; p++) {
5117 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005118 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005120 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121}
5122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005123PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005124"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005126Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005127False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128
5129static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005130unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131{
5132 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5133 register const Py_UNICODE *e;
5134
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 /* Shortcut for single character strings */
5136 if (PyUnicode_GET_SIZE(self) == 1 &&
5137 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005138 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005140 /* Special case for empty strings */
5141 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005142 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005143
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144 e = p + PyUnicode_GET_SIZE(self);
5145 for (; p < e; p++) {
5146 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005147 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005149 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150}
5151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005152PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153"S.join(sequence) -> unicode\n\
5154\n\
5155Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005156sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157
5158static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005159unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005161 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162}
5163
5164static int
5165unicode_length(PyUnicodeObject *self)
5166{
5167 return self->length;
5168}
5169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005170PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171"S.ljust(width) -> unicode\n\
5172\n\
5173Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005174done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175
5176static PyObject *
5177unicode_ljust(PyUnicodeObject *self, PyObject *args)
5178{
5179 int width;
5180 if (!PyArg_ParseTuple(args, "i:ljust", &width))
5181 return NULL;
5182
Tim Peters7a29bd52001-09-12 03:03:31 +00005183 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 Py_INCREF(self);
5185 return (PyObject*) self;
5186 }
5187
5188 return (PyObject*) pad(self, 0, width - self->length, ' ');
5189}
5190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005191PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192"S.lower() -> unicode\n\
5193\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005194Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195
5196static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005197unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 return fixup(self, fixlower);
5200}
5201
/* Strip modes understood by the strip helpers below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
5210
5211static const Py_UNICODE *
5212unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5213{
Tim Peters030a5ce2002-04-22 19:00:10 +00005214 size_t i;
5215 for (i = 0; i < n; ++i)
5216 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005217 return s+i;
5218 return NULL;
5219}
5220
5221/* externally visible for str.strip(unicode) */
5222PyObject *
5223_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5224{
5225 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5226 int len = PyUnicode_GET_SIZE(self);
5227 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5228 int seplen = PyUnicode_GET_SIZE(sepobj);
5229 int i, j;
5230
5231 i = 0;
5232 if (striptype != RIGHTSTRIP) {
5233 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5234 i++;
5235 }
5236 }
5237
5238 j = len;
5239 if (striptype != LEFTSTRIP) {
5240 do {
5241 j--;
5242 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5243 j++;
5244 }
5245
5246 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5247 Py_INCREF(self);
5248 return (PyObject*)self;
5249 }
5250 else
5251 return PyUnicode_FromUnicode(s+i, j-i);
5252}
5253
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254
5255static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005256do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005258 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5259 int len = PyUnicode_GET_SIZE(self), i, j;
5260
5261 i = 0;
5262 if (striptype != RIGHTSTRIP) {
5263 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5264 i++;
5265 }
5266 }
5267
5268 j = len;
5269 if (striptype != LEFTSTRIP) {
5270 do {
5271 j--;
5272 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5273 j++;
5274 }
5275
5276 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5277 Py_INCREF(self);
5278 return (PyObject*)self;
5279 }
5280 else
5281 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282}
5283
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005284
5285static PyObject *
5286do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5287{
5288 PyObject *sep = NULL;
5289
5290 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5291 return NULL;
5292
5293 if (sep != NULL && sep != Py_None) {
5294 if (PyUnicode_Check(sep))
5295 return _PyUnicode_XStrip(self, striptype, sep);
5296 else if (PyString_Check(sep)) {
5297 PyObject *res;
5298 sep = PyUnicode_FromObject(sep);
5299 if (sep==NULL)
5300 return NULL;
5301 res = _PyUnicode_XStrip(self, striptype, sep);
5302 Py_DECREF(sep);
5303 return res;
5304 }
5305 else {
5306 PyErr_Format(PyExc_TypeError,
5307 "%s arg must be None, unicode or str",
5308 STRIPNAME(striptype));
5309 return NULL;
5310 }
5311 }
5312
5313 return do_strip(self, striptype);
5314}
5315
5316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005317PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005318"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005319\n\
5320Return a copy of the string S with leading and trailing\n\
5321whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005322If chars is given and not None, remove characters in chars instead.\n\
5323If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005324
5325static PyObject *
5326unicode_strip(PyUnicodeObject *self, PyObject *args)
5327{
5328 if (PyTuple_GET_SIZE(args) == 0)
5329 return do_strip(self, BOTHSTRIP); /* Common case */
5330 else
5331 return do_argstrip(self, BOTHSTRIP, args);
5332}
5333
5334
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005335PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005336"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005337\n\
5338Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005339If chars is given and not None, remove characters in chars instead.\n\
5340If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005341
5342static PyObject *
5343unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5344{
5345 if (PyTuple_GET_SIZE(args) == 0)
5346 return do_strip(self, LEFTSTRIP); /* Common case */
5347 else
5348 return do_argstrip(self, LEFTSTRIP, args);
5349}
5350
5351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005352PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005353"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005354\n\
5355Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005356If chars is given and not None, remove characters in chars instead.\n\
5357If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005358
5359static PyObject *
5360unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5361{
5362 if (PyTuple_GET_SIZE(args) == 0)
5363 return do_strip(self, RIGHTSTRIP); /* Common case */
5364 else
5365 return do_argstrip(self, RIGHTSTRIP, args);
5366}
5367
5368
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369static PyObject*
5370unicode_repeat(PyUnicodeObject *str, int len)
5371{
5372 PyUnicodeObject *u;
5373 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005374 int nchars;
5375 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376
5377 if (len < 0)
5378 len = 0;
5379
Tim Peters7a29bd52001-09-12 03:03:31 +00005380 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 /* no repeat, return original string */
5382 Py_INCREF(str);
5383 return (PyObject*) str;
5384 }
Tim Peters8f422462000-09-09 06:13:41 +00005385
5386 /* ensure # of chars needed doesn't overflow int and # of bytes
5387 * needed doesn't overflow size_t
5388 */
5389 nchars = len * str->length;
5390 if (len && nchars / len != str->length) {
5391 PyErr_SetString(PyExc_OverflowError,
5392 "repeated string is too long");
5393 return NULL;
5394 }
5395 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5396 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5397 PyErr_SetString(PyExc_OverflowError,
5398 "repeated string is too long");
5399 return NULL;
5400 }
5401 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 if (!u)
5403 return NULL;
5404
5405 p = u->str;
5406
5407 while (len-- > 0) {
5408 Py_UNICODE_COPY(p, str->str, str->length);
5409 p += str->length;
5410 }
5411
5412 return (PyObject*) u;
5413}
5414
5415PyObject *PyUnicode_Replace(PyObject *obj,
5416 PyObject *subobj,
5417 PyObject *replobj,
5418 int maxcount)
5419{
5420 PyObject *self;
5421 PyObject *str1;
5422 PyObject *str2;
5423 PyObject *result;
5424
5425 self = PyUnicode_FromObject(obj);
5426 if (self == NULL)
5427 return NULL;
5428 str1 = PyUnicode_FromObject(subobj);
5429 if (str1 == NULL) {
5430 Py_DECREF(self);
5431 return NULL;
5432 }
5433 str2 = PyUnicode_FromObject(replobj);
5434 if (str2 == NULL) {
5435 Py_DECREF(self);
5436 Py_DECREF(str1);
5437 return NULL;
5438 }
Tim Petersced69f82003-09-16 20:30:58 +00005439 result = replace((PyUnicodeObject *)self,
5440 (PyUnicodeObject *)str1,
5441 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 maxcount);
5443 Py_DECREF(self);
5444 Py_DECREF(str1);
5445 Py_DECREF(str2);
5446 return result;
5447}
5448
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005449PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450"S.replace (old, new[, maxsplit]) -> unicode\n\
5451\n\
5452Return a copy of S with all occurrences of substring\n\
5453old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005454given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455
5456static PyObject*
5457unicode_replace(PyUnicodeObject *self, PyObject *args)
5458{
5459 PyUnicodeObject *str1;
5460 PyUnicodeObject *str2;
5461 int maxcount = -1;
5462 PyObject *result;
5463
5464 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5465 return NULL;
5466 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5467 if (str1 == NULL)
5468 return NULL;
5469 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005470 if (str2 == NULL) {
5471 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005473 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474
5475 result = replace(self, str1, str2, maxcount);
5476
5477 Py_DECREF(str1);
5478 Py_DECREF(str2);
5479 return result;
5480}
5481
5482static
5483PyObject *unicode_repr(PyObject *unicode)
5484{
5485 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5486 PyUnicode_GET_SIZE(unicode),
5487 1);
5488}
5489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005490PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491"S.rfind(sub [,start [,end]]) -> int\n\
5492\n\
5493Return the highest index in S where substring sub is found,\n\
5494such that sub is contained within s[start,end]. Optional\n\
5495arguments start and end are interpreted as in slice notation.\n\
5496\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005497Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498
5499static PyObject *
5500unicode_rfind(PyUnicodeObject *self, PyObject *args)
5501{
5502 PyUnicodeObject *substring;
5503 int start = 0;
5504 int end = INT_MAX;
5505 PyObject *result;
5506
Guido van Rossumb8872e62000-05-09 14:14:27 +00005507 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5508 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 return NULL;
5510 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5511 (PyObject *)substring);
5512 if (substring == NULL)
5513 return NULL;
5514
5515 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5516
5517 Py_DECREF(substring);
5518 return result;
5519}
5520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005521PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522"S.rindex(sub [,start [,end]]) -> int\n\
5523\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005524Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525
5526static PyObject *
5527unicode_rindex(PyUnicodeObject *self, PyObject *args)
5528{
5529 int result;
5530 PyUnicodeObject *substring;
5531 int start = 0;
5532 int end = INT_MAX;
5533
Guido van Rossumb8872e62000-05-09 14:14:27 +00005534 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5535 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 return NULL;
5537 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5538 (PyObject *)substring);
5539 if (substring == NULL)
5540 return NULL;
5541
5542 result = findstring(self, substring, start, end, -1);
5543
5544 Py_DECREF(substring);
5545 if (result < 0) {
5546 PyErr_SetString(PyExc_ValueError, "substring not found");
5547 return NULL;
5548 }
5549 return PyInt_FromLong(result);
5550}
5551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005552PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553"S.rjust(width) -> unicode\n\
5554\n\
5555Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005556done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
5558static PyObject *
5559unicode_rjust(PyUnicodeObject *self, PyObject *args)
5560{
5561 int width;
5562 if (!PyArg_ParseTuple(args, "i:rjust", &width))
5563 return NULL;
5564
Tim Peters7a29bd52001-09-12 03:03:31 +00005565 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 Py_INCREF(self);
5567 return (PyObject*) self;
5568 }
5569
5570 return (PyObject*) pad(self, width - self->length, 0, ' ');
5571}
5572
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573static PyObject*
5574unicode_slice(PyUnicodeObject *self, int start, int end)
5575{
5576 /* standard clamping */
5577 if (start < 0)
5578 start = 0;
5579 if (end < 0)
5580 end = 0;
5581 if (end > self->length)
5582 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005583 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 /* full slice, return original string */
5585 Py_INCREF(self);
5586 return (PyObject*) self;
5587 }
5588 if (start > end)
5589 start = end;
5590 /* copy slice */
5591 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5592 end - start);
5593}
5594
5595PyObject *PyUnicode_Split(PyObject *s,
5596 PyObject *sep,
5597 int maxsplit)
5598{
5599 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005600
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 s = PyUnicode_FromObject(s);
5602 if (s == NULL)
5603 return NULL;
5604 if (sep != NULL) {
5605 sep = PyUnicode_FromObject(sep);
5606 if (sep == NULL) {
5607 Py_DECREF(s);
5608 return NULL;
5609 }
5610 }
5611
5612 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5613
5614 Py_DECREF(s);
5615 Py_XDECREF(sep);
5616 return result;
5617}
5618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005619PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620"S.split([sep [,maxsplit]]) -> list of strings\n\
5621\n\
5622Return a list of the words in S, using sep as the\n\
5623delimiter string. If maxsplit is given, at most maxsplit\n\
5624splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005625is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626
5627static PyObject*
5628unicode_split(PyUnicodeObject *self, PyObject *args)
5629{
5630 PyObject *substring = Py_None;
5631 int maxcount = -1;
5632
5633 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5634 return NULL;
5635
5636 if (substring == Py_None)
5637 return split(self, NULL, maxcount);
5638 else if (PyUnicode_Check(substring))
5639 return split(self, (PyUnicodeObject *)substring, maxcount);
5640 else
5641 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5642}
5643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005644PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005645"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646\n\
5647Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005648Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005649is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650
5651static PyObject*
5652unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5653{
Guido van Rossum86662912000-04-11 15:38:46 +00005654 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655
Guido van Rossum86662912000-04-11 15:38:46 +00005656 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 return NULL;
5658
Guido van Rossum86662912000-04-11 15:38:46 +00005659 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660}
5661
5662static
5663PyObject *unicode_str(PyUnicodeObject *self)
5664{
Fred Drakee4315f52000-05-09 19:53:39 +00005665 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666}
5667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005668PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669"S.swapcase() -> unicode\n\
5670\n\
5671Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005672and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673
5674static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005675unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 return fixup(self, fixswapcase);
5678}
5679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005680PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681"S.translate(table) -> unicode\n\
5682\n\
5683Return a copy of the string S, where all characters have been mapped\n\
5684through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005685Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5686Unmapped characters are left untouched. Characters mapped to None\n\
5687are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688
5689static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005690unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691{
Tim Petersced69f82003-09-16 20:30:58 +00005692 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00005694 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 "ignore");
5696}
5697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005698PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699"S.upper() -> unicode\n\
5700\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005701Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702
5703static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005704unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 return fixup(self, fixupper);
5707}
5708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005709PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710"S.zfill(width) -> unicode\n\
5711\n\
5712Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005713of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714
5715static PyObject *
5716unicode_zfill(PyUnicodeObject *self, PyObject *args)
5717{
5718 int fill;
5719 PyUnicodeObject *u;
5720
5721 int width;
5722 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5723 return NULL;
5724
5725 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005726 if (PyUnicode_CheckExact(self)) {
5727 Py_INCREF(self);
5728 return (PyObject*) self;
5729 }
5730 else
5731 return PyUnicode_FromUnicode(
5732 PyUnicode_AS_UNICODE(self),
5733 PyUnicode_GET_SIZE(self)
5734 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 }
5736
5737 fill = width - self->length;
5738
5739 u = pad(self, fill, 0, '0');
5740
Walter Dörwald068325e2002-04-15 13:36:47 +00005741 if (u == NULL)
5742 return NULL;
5743
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 if (u->str[fill] == '+' || u->str[fill] == '-') {
5745 /* move sign to beginning of string */
5746 u->str[0] = u->str[fill];
5747 u->str[fill] = '0';
5748 }
5749
5750 return (PyObject*) u;
5751}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752
#if 0
/* Compiled out: returns the value of unicode_freelist_size as an int
   object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
5760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005761PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005762"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005764Return True if S starts with the specified prefix, False otherwise.\n\
5765With optional start, test S beginning at that position.\n\
5766With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767
5768static PyObject *
5769unicode_startswith(PyUnicodeObject *self,
5770 PyObject *args)
5771{
5772 PyUnicodeObject *substring;
5773 int start = 0;
5774 int end = INT_MAX;
5775 PyObject *result;
5776
Guido van Rossumb8872e62000-05-09 14:14:27 +00005777 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5778 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 return NULL;
5780 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5781 (PyObject *)substring);
5782 if (substring == NULL)
5783 return NULL;
5784
Guido van Rossum77f6a652002-04-03 22:41:51 +00005785 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786
5787 Py_DECREF(substring);
5788 return result;
5789}
5790
5791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005792PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005793"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005795Return True if S ends with the specified suffix, False otherwise.\n\
5796With optional start, test S beginning at that position.\n\
5797With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798
5799static PyObject *
5800unicode_endswith(PyUnicodeObject *self,
5801 PyObject *args)
5802{
5803 PyUnicodeObject *substring;
5804 int start = 0;
5805 int end = INT_MAX;
5806 PyObject *result;
5807
Guido van Rossumb8872e62000-05-09 14:14:27 +00005808 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5809 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 return NULL;
5811 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5812 (PyObject *)substring);
5813 if (substring == NULL)
5814 return NULL;
5815
Guido van Rossum77f6a652002-04-03 22:41:51 +00005816 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
5818 Py_DECREF(substring);
5819 return result;
5820}
5821
5822
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005823
5824static PyObject *
5825unicode_getnewargs(PyUnicodeObject *v)
5826{
5827 return Py_BuildValue("(u#)", v->str, v->length);
5828}
5829
5830
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831static PyMethodDef unicode_methods[] = {
5832
5833 /* Order is according to common usage: often used methods should
5834 appear first, since lookup is done sequentially. */
5835
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005836 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5837 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5838 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5839 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5840 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5841 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5842 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5843 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5844 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5845 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5846 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5847 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5848 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005849 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005850/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5851 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5852 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5853 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005854 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005855 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005856 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005857 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5858 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5859 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5860 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5861 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5862 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5863 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5864 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5865 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5866 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5867 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5868 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5869 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5870 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005871 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005872#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005873 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874#endif
5875
5876#if 0
5877 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005878 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879#endif
5880
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005881 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 {NULL, NULL}
5883};
5884
Neil Schemenauerce30bc92002-11-18 16:10:18 +00005885static PyObject *
5886unicode_mod(PyObject *v, PyObject *w)
5887{
5888 if (!PyUnicode_Check(v)) {
5889 Py_INCREF(Py_NotImplemented);
5890 return Py_NotImplemented;
5891 }
5892 return PyUnicode_Format(v, w);
5893}
5894
5895static PyNumberMethods unicode_as_number = {
5896 0, /*nb_add*/
5897 0, /*nb_subtract*/
5898 0, /*nb_multiply*/
5899 0, /*nb_divide*/
5900 unicode_mod, /*nb_remainder*/
5901};
5902
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903static PySequenceMethods unicode_as_sequence = {
5904 (inquiry) unicode_length, /* sq_length */
5905 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5906 (intargfunc) unicode_repeat, /* sq_repeat */
5907 (intargfunc) unicode_getitem, /* sq_item */
5908 (intintargfunc) unicode_slice, /* sq_slice */
5909 0, /* sq_ass_item */
5910 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005911 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912};
5913
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005914static PyObject*
5915unicode_subscript(PyUnicodeObject* self, PyObject* item)
5916{
5917 if (PyInt_Check(item)) {
5918 long i = PyInt_AS_LONG(item);
5919 if (i < 0)
5920 i += PyString_GET_SIZE(self);
5921 return unicode_getitem(self, i);
5922 } else if (PyLong_Check(item)) {
5923 long i = PyLong_AsLong(item);
5924 if (i == -1 && PyErr_Occurred())
5925 return NULL;
5926 if (i < 0)
5927 i += PyString_GET_SIZE(self);
5928 return unicode_getitem(self, i);
5929 } else if (PySlice_Check(item)) {
5930 int start, stop, step, slicelength, cur, i;
5931 Py_UNICODE* source_buf;
5932 Py_UNICODE* result_buf;
5933 PyObject* result;
5934
5935 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5936 &start, &stop, &step, &slicelength) < 0) {
5937 return NULL;
5938 }
5939
5940 if (slicelength <= 0) {
5941 return PyUnicode_FromUnicode(NULL, 0);
5942 } else {
5943 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5944 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5945
5946 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5947 result_buf[i] = source_buf[cur];
5948 }
Tim Petersced69f82003-09-16 20:30:58 +00005949
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005950 result = PyUnicode_FromUnicode(result_buf, slicelength);
5951 PyMem_FREE(result_buf);
5952 return result;
5953 }
5954 } else {
5955 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5956 return NULL;
5957 }
5958}
5959
5960static PyMappingMethods unicode_as_mapping = {
5961 (inquiry)unicode_length, /* mp_length */
5962 (binaryfunc)unicode_subscript, /* mp_subscript */
5963 (objobjargproc)0, /* mp_ass_subscript */
5964};
5965
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966static int
5967unicode_buffer_getreadbuf(PyUnicodeObject *self,
5968 int index,
5969 const void **ptr)
5970{
5971 if (index != 0) {
5972 PyErr_SetString(PyExc_SystemError,
5973 "accessing non-existent unicode segment");
5974 return -1;
5975 }
5976 *ptr = (void *) self->str;
5977 return PyUnicode_GET_DATA_SIZE(self);
5978}
5979
5980static int
5981unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5982 const void **ptr)
5983{
5984 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005985 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 return -1;
5987}
5988
5989static int
5990unicode_buffer_getsegcount(PyUnicodeObject *self,
5991 int *lenp)
5992{
5993 if (lenp)
5994 *lenp = PyUnicode_GET_DATA_SIZE(self);
5995 return 1;
5996}
5997
5998static int
5999unicode_buffer_getcharbuf(PyUnicodeObject *self,
6000 int index,
6001 const void **ptr)
6002{
6003 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006004
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 if (index != 0) {
6006 PyErr_SetString(PyExc_SystemError,
6007 "accessing non-existent unicode segment");
6008 return -1;
6009 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006010 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 if (str == NULL)
6012 return -1;
6013 *ptr = (void *) PyString_AS_STRING(str);
6014 return PyString_GET_SIZE(str);
6015}
6016
6017/* Helpers for PyUnicode_Format() */
6018
6019static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006020getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021{
6022 int argidx = *p_argidx;
6023 if (argidx < arglen) {
6024 (*p_argidx)++;
6025 if (arglen < 0)
6026 return args;
6027 else
6028 return PyTuple_GetItem(args, argidx);
6029 }
6030 PyErr_SetString(PyExc_TypeError,
6031 "not enough arguments for format string");
6032 return NULL;
6033}
6034
/* Bit flags collected while parsing a format specifier; each one is
   set by the flag character noted next to it. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
6040
6041static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043{
6044 register int i;
6045 int len;
6046 va_list va;
6047 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049
6050 /* First, format the string as char array, then expand to Py_UNICODE
6051 array. */
6052 charbuffer = (char *)buffer;
6053 len = vsprintf(charbuffer, format, va);
6054 for (i = len - 1; i >= 0; i--)
6055 buffer[i] = (Py_UNICODE) charbuffer[i];
6056
6057 va_end(va);
6058 return len;
6059}
6060
Guido van Rossum078151d2002-08-11 04:24:12 +00006061/* XXX To save some code duplication, formatfloat/long/int could have been
6062 shared with stringobject.c, converting from 8-bit to Unicode after the
6063 formatting is done. */
6064
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065static int
6066formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006067 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 int flags,
6069 int prec,
6070 int type,
6071 PyObject *v)
6072{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006073 /* fmt = '%#.' + `prec` + `type`
6074 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 char fmt[20];
6076 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006077
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 x = PyFloat_AsDouble(v);
6079 if (x == -1.0 && PyErr_Occurred())
6080 return -1;
6081 if (prec < 0)
6082 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6084 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006085 /* Worst case length calc to ensure no buffer overrun:
6086
6087 'g' formats:
6088 fmt = %#.<prec>g
6089 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6090 for any double rep.)
6091 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6092
6093 'f' formats:
6094 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6095 len = 1 + 50 + 1 + prec = 52 + prec
6096
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006097 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006098 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006099
6100 */
6101 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6102 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006103 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006104 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006105 return -1;
6106 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006107 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6108 (flags&F_ALT) ? "#" : "",
6109 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 return usprintf(buf, fmt, x);
6111}
6112
Tim Peters38fd5b62000-09-21 05:43:11 +00006113static PyObject*
6114formatlong(PyObject *val, int flags, int prec, int type)
6115{
6116 char *buf;
6117 int i, len;
6118 PyObject *str; /* temporary string object. */
6119 PyUnicodeObject *result;
6120
6121 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6122 if (!str)
6123 return NULL;
6124 result = _PyUnicode_New(len);
6125 for (i = 0; i < len; i++)
6126 result->str[i] = buf[i];
6127 result->str[len] = 0;
6128 Py_DECREF(str);
6129 return (PyObject*)result;
6130}
6131
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132static int
6133formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006134 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 int flags,
6136 int prec,
6137 int type,
6138 PyObject *v)
6139{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006140 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006141 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6142 * + 1 + 1
6143 * = 24
6144 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006145 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 long x;
6147
6148 x = PyInt_AsLong(v);
6149 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006150 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00006151 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00006152 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00006153 "%u/%o/%x/%X of negative int will return "
6154 "a signed string in Python 2.4 and up") < 0)
6155 return -1;
6156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006158 prec = 1;
6159
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006160 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006161 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6162 */
6163 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006164 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006165 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006166 return -1;
6167 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006168
6169 if ((flags & F_ALT) &&
6170 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006171 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006172 * of issues that cause pain:
6173 * - when 0 is being converted, the C standard leaves off
6174 * the '0x' or '0X', which is inconsistent with other
6175 * %#x/%#X conversions and inconsistent with Python's
6176 * hex() function
6177 * - there are platforms that violate the standard and
6178 * convert 0 with the '0x' or '0X'
6179 * (Metrowerks, Compaq Tru64)
6180 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006181 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006182 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006183 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006184 * We can achieve the desired consistency by inserting our
6185 * own '0x' or '0X' prefix, and substituting %x/%X in place
6186 * of %#x/%#X.
6187 *
6188 * Note that this is the same approach as used in
6189 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006190 */
Tim Petersced69f82003-09-16 20:30:58 +00006191 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006192 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006193 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006194 else {
6195 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
Tim Petersced69f82003-09-16 20:30:58 +00006196 (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006197 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 return usprintf(buf, fmt, x);
6200}
6201
6202static int
6203formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006204 size_t buflen,
6205 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006207 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006208 if (PyUnicode_Check(v)) {
6209 if (PyUnicode_GET_SIZE(v) != 1)
6210 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006214 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006215 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006216 goto onError;
6217 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219
6220 else {
6221 /* Integer input truncated to a character */
6222 long x;
6223 x = PyInt_AsLong(v);
6224 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006225 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006226#ifdef Py_UNICODE_WIDE
6227 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006228 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006229 "%c arg not in range(0x110000) "
6230 "(wide Python build)");
6231 return -1;
6232 }
6233#else
6234 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006235 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006236 "%c arg not in range(0x10000) "
6237 "(narrow Python build)");
6238 return -1;
6239 }
6240#endif
6241 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 }
6243 buf[1] = '\0';
6244 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006245
6246 onError:
6247 PyErr_SetString(PyExc_TypeError,
6248 "%c requires int or char");
6249 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250}
6251
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
6261
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262PyObject *PyUnicode_Format(PyObject *format,
6263 PyObject *args)
6264{
6265 Py_UNICODE *fmt, *res;
6266 int fmtcnt, rescnt, reslen, arglen, argidx;
6267 int args_owned = 0;
6268 PyUnicodeObject *result = NULL;
6269 PyObject *dict = NULL;
6270 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006271
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 if (format == NULL || args == NULL) {
6273 PyErr_BadInternalCall();
6274 return NULL;
6275 }
6276 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006277 if (uformat == NULL)
6278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 fmt = PyUnicode_AS_UNICODE(uformat);
6280 fmtcnt = PyUnicode_GET_SIZE(uformat);
6281
6282 reslen = rescnt = fmtcnt + 100;
6283 result = _PyUnicode_New(reslen);
6284 if (result == NULL)
6285 goto onError;
6286 res = PyUnicode_AS_UNICODE(result);
6287
6288 if (PyTuple_Check(args)) {
6289 arglen = PyTuple_Size(args);
6290 argidx = 0;
6291 }
6292 else {
6293 arglen = -1;
6294 argidx = -2;
6295 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006296 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6297 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 dict = args;
6299
6300 while (--fmtcnt >= 0) {
6301 if (*fmt != '%') {
6302 if (--rescnt < 0) {
6303 rescnt = fmtcnt + 100;
6304 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006305 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 return NULL;
6307 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6308 --rescnt;
6309 }
6310 *res++ = *fmt++;
6311 }
6312 else {
6313 /* Got a format specifier */
6314 int flags = 0;
6315 int width = -1;
6316 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 Py_UNICODE c = '\0';
6318 Py_UNICODE fill;
6319 PyObject *v = NULL;
6320 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006321 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322 Py_UNICODE sign;
6323 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006324 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325
6326 fmt++;
6327 if (*fmt == '(') {
6328 Py_UNICODE *keystart;
6329 int keylen;
6330 PyObject *key;
6331 int pcount = 1;
6332
6333 if (dict == NULL) {
6334 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006335 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 goto onError;
6337 }
6338 ++fmt;
6339 --fmtcnt;
6340 keystart = fmt;
6341 /* Skip over balanced parentheses */
6342 while (pcount > 0 && --fmtcnt >= 0) {
6343 if (*fmt == ')')
6344 --pcount;
6345 else if (*fmt == '(')
6346 ++pcount;
6347 fmt++;
6348 }
6349 keylen = fmt - keystart - 1;
6350 if (fmtcnt < 0 || pcount > 0) {
6351 PyErr_SetString(PyExc_ValueError,
6352 "incomplete format key");
6353 goto onError;
6354 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006355#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006356 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 then looked up since Python uses strings to hold
6358 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006359 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360 key = PyUnicode_EncodeUTF8(keystart,
6361 keylen,
6362 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006363#else
6364 key = PyUnicode_FromUnicode(keystart, keylen);
6365#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366 if (key == NULL)
6367 goto onError;
6368 if (args_owned) {
6369 Py_DECREF(args);
6370 args_owned = 0;
6371 }
6372 args = PyObject_GetItem(dict, key);
6373 Py_DECREF(key);
6374 if (args == NULL) {
6375 goto onError;
6376 }
6377 args_owned = 1;
6378 arglen = -1;
6379 argidx = -2;
6380 }
6381 while (--fmtcnt >= 0) {
6382 switch (c = *fmt++) {
6383 case '-': flags |= F_LJUST; continue;
6384 case '+': flags |= F_SIGN; continue;
6385 case ' ': flags |= F_BLANK; continue;
6386 case '#': flags |= F_ALT; continue;
6387 case '0': flags |= F_ZERO; continue;
6388 }
6389 break;
6390 }
6391 if (c == '*') {
6392 v = getnextarg(args, arglen, &argidx);
6393 if (v == NULL)
6394 goto onError;
6395 if (!PyInt_Check(v)) {
6396 PyErr_SetString(PyExc_TypeError,
6397 "* wants int");
6398 goto onError;
6399 }
6400 width = PyInt_AsLong(v);
6401 if (width < 0) {
6402 flags |= F_LJUST;
6403 width = -width;
6404 }
6405 if (--fmtcnt >= 0)
6406 c = *fmt++;
6407 }
6408 else if (c >= '0' && c <= '9') {
6409 width = c - '0';
6410 while (--fmtcnt >= 0) {
6411 c = *fmt++;
6412 if (c < '0' || c > '9')
6413 break;
6414 if ((width*10) / 10 != width) {
6415 PyErr_SetString(PyExc_ValueError,
6416 "width too big");
6417 goto onError;
6418 }
6419 width = width*10 + (c - '0');
6420 }
6421 }
6422 if (c == '.') {
6423 prec = 0;
6424 if (--fmtcnt >= 0)
6425 c = *fmt++;
6426 if (c == '*') {
6427 v = getnextarg(args, arglen, &argidx);
6428 if (v == NULL)
6429 goto onError;
6430 if (!PyInt_Check(v)) {
6431 PyErr_SetString(PyExc_TypeError,
6432 "* wants int");
6433 goto onError;
6434 }
6435 prec = PyInt_AsLong(v);
6436 if (prec < 0)
6437 prec = 0;
6438 if (--fmtcnt >= 0)
6439 c = *fmt++;
6440 }
6441 else if (c >= '0' && c <= '9') {
6442 prec = c - '0';
6443 while (--fmtcnt >= 0) {
6444 c = Py_CHARMASK(*fmt++);
6445 if (c < '0' || c > '9')
6446 break;
6447 if ((prec*10) / 10 != prec) {
6448 PyErr_SetString(PyExc_ValueError,
6449 "prec too big");
6450 goto onError;
6451 }
6452 prec = prec*10 + (c - '0');
6453 }
6454 }
6455 } /* prec */
6456 if (fmtcnt >= 0) {
6457 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 if (--fmtcnt >= 0)
6459 c = *fmt++;
6460 }
6461 }
6462 if (fmtcnt < 0) {
6463 PyErr_SetString(PyExc_ValueError,
6464 "incomplete format");
6465 goto onError;
6466 }
6467 if (c != '%') {
6468 v = getnextarg(args, arglen, &argidx);
6469 if (v == NULL)
6470 goto onError;
6471 }
6472 sign = 0;
6473 fill = ' ';
6474 switch (c) {
6475
6476 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006477 pbuf = formatbuf;
6478 /* presume that buffer length is at least 1 */
6479 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 len = 1;
6481 break;
6482
6483 case 's':
6484 case 'r':
6485 if (PyUnicode_Check(v) && c == 's') {
6486 temp = v;
6487 Py_INCREF(temp);
6488 }
6489 else {
6490 PyObject *unicode;
6491 if (c == 's')
6492 temp = PyObject_Str(v);
6493 else
6494 temp = PyObject_Repr(v);
6495 if (temp == NULL)
6496 goto onError;
6497 if (!PyString_Check(temp)) {
6498 /* XXX Note: this should never happen, since
6499 PyObject_Repr() and PyObject_Str() assure
6500 this */
6501 Py_DECREF(temp);
6502 PyErr_SetString(PyExc_TypeError,
6503 "%s argument has non-string str()");
6504 goto onError;
6505 }
Fred Drakee4315f52000-05-09 19:53:39 +00006506 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006508 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 "strict");
6510 Py_DECREF(temp);
6511 temp = unicode;
6512 if (temp == NULL)
6513 goto onError;
6514 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006515 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 len = PyUnicode_GET_SIZE(temp);
6517 if (prec >= 0 && len > prec)
6518 len = prec;
6519 break;
6520
6521 case 'i':
6522 case 'd':
6523 case 'u':
6524 case 'o':
6525 case 'x':
6526 case 'X':
6527 if (c == 'i')
6528 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006529 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006530 temp = formatlong(v, flags, prec, c);
6531 if (!temp)
6532 goto onError;
6533 pbuf = PyUnicode_AS_UNICODE(temp);
6534 len = PyUnicode_GET_SIZE(temp);
6535 /* unbounded ints can always produce
6536 a sign character! */
6537 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006539 else {
6540 pbuf = formatbuf;
6541 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6542 flags, prec, c, v);
6543 if (len < 0)
6544 goto onError;
6545 /* only d conversion is signed */
6546 sign = c == 'd';
6547 }
6548 if (flags & F_ZERO)
6549 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 break;
6551
6552 case 'e':
6553 case 'E':
6554 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006555 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556 case 'g':
6557 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006558 if (c == 'F')
6559 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006560 pbuf = formatbuf;
6561 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6562 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563 if (len < 0)
6564 goto onError;
6565 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006566 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 fill = '0';
6568 break;
6569
6570 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006571 pbuf = formatbuf;
6572 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 if (len < 0)
6574 goto onError;
6575 break;
6576
6577 default:
6578 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006579 "unsupported format character '%c' (0x%x) "
6580 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006581 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006582 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006583 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 goto onError;
6585 }
6586 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006587 if (*pbuf == '-' || *pbuf == '+') {
6588 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 len--;
6590 }
6591 else if (flags & F_SIGN)
6592 sign = '+';
6593 else if (flags & F_BLANK)
6594 sign = ' ';
6595 else
6596 sign = 0;
6597 }
6598 if (width < len)
6599 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006600 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 reslen -= rescnt;
6602 rescnt = width + fmtcnt + 100;
6603 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006604 if (reslen < 0) {
6605 Py_DECREF(result);
6606 return PyErr_NoMemory();
6607 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006608 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 return NULL;
6610 res = PyUnicode_AS_UNICODE(result)
6611 + reslen - rescnt;
6612 }
6613 if (sign) {
6614 if (fill != ' ')
6615 *res++ = sign;
6616 rescnt--;
6617 if (width > len)
6618 width--;
6619 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006620 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6621 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006622 assert(pbuf[1] == c);
6623 if (fill != ' ') {
6624 *res++ = *pbuf++;
6625 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006626 }
Tim Petersfff53252001-04-12 18:38:48 +00006627 rescnt -= 2;
6628 width -= 2;
6629 if (width < 0)
6630 width = 0;
6631 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 if (width > len && !(flags & F_LJUST)) {
6634 do {
6635 --rescnt;
6636 *res++ = fill;
6637 } while (--width > len);
6638 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006639 if (fill == ' ') {
6640 if (sign)
6641 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006642 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006643 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006644 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006645 *res++ = *pbuf++;
6646 *res++ = *pbuf++;
6647 }
6648 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006649 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 res += len;
6651 rescnt -= len;
6652 while (--width >= len) {
6653 --rescnt;
6654 *res++ = ' ';
6655 }
6656 if (dict && (argidx < arglen) && c != '%') {
6657 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006658 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 goto onError;
6660 }
6661 Py_XDECREF(temp);
6662 } /* '%' */
6663 } /* until end */
6664 if (argidx < arglen && !dict) {
6665 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006666 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 goto onError;
6668 }
6669
6670 if (args_owned) {
6671 Py_DECREF(args);
6672 }
6673 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00006674 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006675 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 return (PyObject *)result;
6677
6678 onError:
6679 Py_XDECREF(result);
6680 Py_DECREF(uformat);
6681 if (args_owned) {
6682 Py_DECREF(args);
6683 }
6684 return NULL;
6685}
6686
/* Buffer-interface slot table for unicode objects; PyUnicode_Type points
   at it through its tp_as_buffer slot.  The entries are positional and
   are, in order, the read-buffer, write-buffer, segment-count and
   char-buffer handlers, each cast to the matching slot function type. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
6693
Jeremy Hylton938ace62002-07-17 16:30:39 +00006694static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006695unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6696
Tim Peters6d6c1a32001-08-02 04:15:00 +00006697static PyObject *
6698unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6699{
6700 PyObject *x = NULL;
6701 static char *kwlist[] = {"string", "encoding", "errors", 0};
6702 char *encoding = NULL;
6703 char *errors = NULL;
6704
Guido van Rossume023fe02001-08-30 03:12:59 +00006705 if (type != &PyUnicode_Type)
6706 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006707 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6708 kwlist, &x, &encoding, &errors))
6709 return NULL;
6710 if (x == NULL)
6711 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006712 if (encoding == NULL && errors == NULL)
6713 return PyObject_Unicode(x);
6714 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006715 return PyUnicode_FromEncodedObject(x, encoding, errors);
6716}
6717
Guido van Rossume023fe02001-08-30 03:12:59 +00006718static PyObject *
6719unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6720{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006721 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006722 int n;
6723
6724 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6725 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6726 if (tmp == NULL)
6727 return NULL;
6728 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006729 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006730 if (pnew == NULL) {
6731 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00006732 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00006733 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006734 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6735 if (pnew->str == NULL) {
6736 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006737 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006738 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00006739 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00006740 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006741 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6742 pnew->length = n;
6743 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006744 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006745 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006746}
6747
/* Docstring of the unicode type; PyUnicode_Type refers to it in its
   tp_doc slot.  The text is emitted at runtime, so it is kept verbatim. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006754
/* Type object for the built-in "unicode" type.

   The initializer is positional, so the order of the entries matters;
   each line is labelled with the slot it fills, and a 0 entry leaves that
   slot empty in this initializer.  The type derives from
   PyBaseString_Type (tp_base), creates instances through unicode_new
   (tp_new) and releases them with PyObject_Del (tp_free).
   _PyUnicode_Init() passes this object to PyType_Ready(). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
        Py_TPFLAGS_BASETYPE,            /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
6799
6800/* Initialize the Unicode implementation */
6801
Thomas Wouters78890102000-07-22 19:25:51 +00006802void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006804 int i;
6805
Fred Drakee4315f52000-05-09 19:53:39 +00006806 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006807 unicode_freelist = NULL;
6808 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00006810 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006811 for (i = 0; i < 256; i++)
6812 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006813 if (PyType_Ready(&PyUnicode_Type) < 0)
6814 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815}
6816
6817/* Finalize the Unicode implementation */
6818
6819void
Thomas Wouters78890102000-07-22 19:25:51 +00006820_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006822 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006823 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006825 Py_XDECREF(unicode_empty);
6826 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006827
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006828 for (i = 0; i < 256; i++) {
6829 if (unicode_latin1[i]) {
6830 Py_DECREF(unicode_latin1[i]);
6831 unicode_latin1[i] = NULL;
6832 }
6833 }
6834
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006835 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 PyUnicodeObject *v = u;
6837 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006838 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006839 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006840 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006841 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006843 unicode_freelist = NULL;
6844 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006846
6847/*
6848Local variables:
6849c-basic-offset: 4
6850indent-tabs-mode: nil
6851End:
6852*/