blob: 03559da967170bbf2a3875981a220004f2d2af87 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects parked on the free list */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Keep-alive threshold for buffers of objects on the free list.

   When an object is put on the free list, its character buffer is
   kept allocated if the object's length is below this limit, which
   saves a malloc() when a small object is recycled.

   Worst case, this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory lying around.

   A limit of 0 disables the optimization.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
301/* Internal API for use in unicodeobject.c only ! */
302#define _PyUnicode_Resize(unicodevar, length) \
303 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
371 for (i = size; i >= 0; i--)
372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
387 if (size > PyUnicode_GET_SIZE(unicode))
388 size = PyUnicode_GET_SIZE(unicode);
389#ifdef HAVE_USABLE_WCHAR_T
390 memcpy(w, unicode->str, size * sizeof(wchar_t));
391#else
392 {
393 register Py_UNICODE *u;
394 register int i;
395 u = PyUnicode_AS_UNICODE(unicode);
396 for (i = size; i >= 0; i--)
397 *w++ = *u++;
398 }
399#endif
400
401 return size;
402}
403
404#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
408 Py_UNICODE s[2];
409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
426 if (ordinal <= 0xffff) {
427 /* UCS-2 character */
428 s[0] = (Py_UNICODE) ordinal;
429 return PyUnicode_FromUnicode(s, 1);
430 }
431 else {
432#ifndef Py_UNICODE_WIDE
433 /* UCS-4 character. store as two surrogate characters */
434 ordinal -= 0x10000L;
435 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
436 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
437 return PyUnicode_FromUnicode(s, 2);
438#else
439 s[0] = (Py_UNICODE)ordinal;
440 return PyUnicode_FromUnicode(s, 1);
441#endif
442 }
443}
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445PyObject *PyUnicode_FromObject(register PyObject *obj)
446{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000447 /* XXX Perhaps we should make this API an alias of
448 PyObject_Unicode() instead ?! */
449 if (PyUnicode_CheckExact(obj)) {
450 Py_INCREF(obj);
451 return obj;
452 }
453 if (PyUnicode_Check(obj)) {
454 /* For a Unicode subtype that's not a Unicode object,
455 return a true Unicode object with the same data. */
456 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
457 PyUnicode_GET_SIZE(obj));
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
460}
461
462PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
463 const char *encoding,
464 const char *errors)
465{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000466 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000468 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000469
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 if (obj == NULL) {
471 PyErr_BadInternalCall();
472 return NULL;
473 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000474
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000475#if 0
476 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000477 that no encodings is given and then redirect to
478 PyObject_Unicode() which then applies the additional logic for
479 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481 NOTE: This API should really only be used for object which
482 represent *encoded* Unicode !
483
484 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000485 if (PyUnicode_Check(obj)) {
486 if (encoding) {
487 PyErr_SetString(PyExc_TypeError,
488 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000489 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000492 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000493#else
494 if (PyUnicode_Check(obj)) {
495 PyErr_SetString(PyExc_TypeError,
496 "decoding Unicode is not supported");
497 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000498 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499#endif
500
501 /* Coerce object */
502 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000503 s = PyString_AS_STRING(obj);
504 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000505 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000506 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
507 /* Overwrite the error message with something more useful in
508 case of a TypeError. */
509 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000511 "coercing to Unicode: need string or buffer, "
512 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000513 obj->ob_type->tp_name);
514 goto onError;
515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000518 if (len == 0) {
519 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521 }
Tim Petersced69f82003-09-16 20:30:58 +0000522 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000523 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000524
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000525 return v;
526
527 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529}
530
531PyObject *PyUnicode_Decode(const char *s,
532 int size,
533 const char *encoding,
534 const char *errors)
535{
536 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000537
538 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000539 encoding = PyUnicode_GetDefaultEncoding();
540
541 /* Shortcuts for common default encodings */
542 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000544 else if (strcmp(encoding, "latin-1") == 0)
545 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000546#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
547 else if (strcmp(encoding, "mbcs") == 0)
548 return PyUnicode_DecodeMBCS(s, size, errors);
549#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000550 else if (strcmp(encoding, "ascii") == 0)
551 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552
553 /* Decode via the codec registry */
554 buffer = PyBuffer_FromMemory((void *)s, size);
555 if (buffer == NULL)
556 goto onError;
557 unicode = PyCodec_Decode(buffer, encoding, errors);
558 if (unicode == NULL)
559 goto onError;
560 if (!PyUnicode_Check(unicode)) {
561 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000562 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000563 unicode->ob_type->tp_name);
564 Py_DECREF(unicode);
565 goto onError;
566 }
567 Py_DECREF(buffer);
568 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000569
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 onError:
571 Py_XDECREF(buffer);
572 return NULL;
573}
574
575PyObject *PyUnicode_Encode(const Py_UNICODE *s,
576 int size,
577 const char *encoding,
578 const char *errors)
579{
580 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000581
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 unicode = PyUnicode_FromUnicode(s, size);
583 if (unicode == NULL)
584 return NULL;
585 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
586 Py_DECREF(unicode);
587 return v;
588}
589
590PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
591 const char *encoding,
592 const char *errors)
593{
594 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596 if (!PyUnicode_Check(unicode)) {
597 PyErr_BadArgument();
598 goto onError;
599 }
Fred Drakee4315f52000-05-09 19:53:39 +0000600
Tim Petersced69f82003-09-16 20:30:58 +0000601 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000602 encoding = PyUnicode_GetDefaultEncoding();
603
604 /* Shortcuts for common default encodings */
605 if (errors == NULL) {
606 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000607 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000608 else if (strcmp(encoding, "latin-1") == 0)
609 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000610#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
611 else if (strcmp(encoding, "mbcs") == 0)
612 return PyUnicode_AsMBCSString(unicode);
613#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000614 else if (strcmp(encoding, "ascii") == 0)
615 return PyUnicode_AsASCIIString(unicode);
616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617
618 /* Encode via the codec registry */
619 v = PyCodec_Encode(unicode, encoding, errors);
620 if (v == NULL)
621 goto onError;
622 /* XXX Should we really enforce this ? */
623 if (!PyString_Check(v)) {
624 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000625 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 v->ob_type->tp_name);
627 Py_DECREF(v);
628 goto onError;
629 }
630 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000631
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632 onError:
633 return NULL;
634}
635
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000636PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
637 const char *errors)
638{
639 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
640
641 if (v)
642 return v;
643 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
644 if (v && errors == NULL)
645 ((PyUnicodeObject *)unicode)->defenc = v;
646 return v;
647}
648
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
650{
651 if (!PyUnicode_Check(unicode)) {
652 PyErr_BadArgument();
653 goto onError;
654 }
655 return PyUnicode_AS_UNICODE(unicode);
656
657 onError:
658 return NULL;
659}
660
661int PyUnicode_GetSize(PyObject *unicode)
662{
663 if (!PyUnicode_Check(unicode)) {
664 PyErr_BadArgument();
665 goto onError;
666 }
667 return PyUnicode_GET_SIZE(unicode);
668
669 onError:
670 return -1;
671}
672
Thomas Wouters78890102000-07-22 19:25:51 +0000673const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000674{
675 return unicode_default_encoding;
676}
677
678int PyUnicode_SetDefaultEncoding(const char *encoding)
679{
680 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000681
Fred Drakee4315f52000-05-09 19:53:39 +0000682 /* Make sure the encoding is valid. As side effect, this also
683 loads the encoding into the codec registry cache. */
684 v = _PyCodec_Lookup(encoding);
685 if (v == NULL)
686 goto onError;
687 Py_DECREF(v);
688 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000689 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000690 sizeof(unicode_default_encoding));
691 return 0;
692
693 onError:
694 return -1;
695}
696
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000697/* error handling callback helper:
698 build arguments, call the callback and check the arguments,
699 if no exception occured, copy the replacement to the output
700 and adjust various state variables.
701 return 0 on success, -1 on error
702*/
703
704static
705int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
706 const char *encoding, const char *reason,
707 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
708 PyObject **output, int *outpos, Py_UNICODE **outptr)
709{
710 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
711
712 PyObject *restuple = NULL;
713 PyObject *repunicode = NULL;
714 int outsize = PyUnicode_GET_SIZE(*output);
715 int requiredsize;
716 int newpos;
717 Py_UNICODE *repptr;
718 int repsize;
719 int res = -1;
720
721 if (*errorHandler == NULL) {
722 *errorHandler = PyCodec_LookupError(errors);
723 if (*errorHandler == NULL)
724 goto onError;
725 }
726
727 if (*exceptionObject == NULL) {
728 *exceptionObject = PyUnicodeDecodeError_Create(
729 encoding, input, insize, *startinpos, *endinpos, reason);
730 if (*exceptionObject == NULL)
731 goto onError;
732 }
733 else {
734 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
735 goto onError;
736 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
737 goto onError;
738 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
739 goto onError;
740 }
741
742 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
743 if (restuple == NULL)
744 goto onError;
745 if (!PyTuple_Check(restuple)) {
746 PyErr_Format(PyExc_TypeError, &argparse[4]);
747 goto onError;
748 }
749 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
750 goto onError;
751 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000752 newpos = insize+newpos;
753 if (newpos<0 || newpos>insize) {
754 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
755 goto onError;
756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000757
758 /* need more space? (at least enough for what we
759 have+the replacement+the rest of the string (starting
760 at the new input position), so we won't have to check space
761 when there are no errors in the rest of the string) */
762 repptr = PyUnicode_AS_UNICODE(repunicode);
763 repsize = PyUnicode_GET_SIZE(repunicode);
764 requiredsize = *outpos + repsize + insize-newpos;
765 if (requiredsize > outsize) {
766 if (requiredsize<2*outsize)
767 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000768 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000769 goto onError;
770 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
771 }
772 *endinpos = newpos;
773 *inptr = input + newpos;
774 Py_UNICODE_COPY(*outptr, repptr, repsize);
775 *outptr += repsize;
776 *outpos += repsize;
777 /* we made it! */
778 res = 0;
779
780 onError:
781 Py_XDECREF(restuple);
782 return res;
783}
784
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000785/* --- UTF-7 Codec -------------------------------------------------------- */
786
787/* see RFC2152 for details */
788
/* Classification of the 128 ASCII code points for the UTF-7 codec
   (RFC 2152).  The table is only ever read, so it is declared const.

   Each entry says whether the character is special, i.e. cannot be
   directly encoded:
     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional) */
static
const char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
807
/* Nonzero if character c has to be base64-encoded: anything above 127
   and every class-1 entry of utf7_special always; class 2 (whitespace)
   only when encodeWS is nonzero; class 3 (Set O) only when encodeO is
   nonzero. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* Nonzero if c is one of the 64 base64 digits.  Written as explicit
   ASCII ranges instead of isalnum(): the argument is a Py_UNICODE, and
   isalnum() is locale dependent and undefined for values that do not
   fit in an unsigned char. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Numeric value (0..63) of base64 digit c; only meaningful when
   B64CHAR(c) holds. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit every complete 6-bit group held in the low `bits` bits of ch as
   a base64 digit through out; out is advanced and bits reduced. */
#define ENCODE(out, ch, bits)                           \
    do {                                                \
        while ((bits) >= 6) {                           \
            *(out)++ = B64((ch) >> ((bits)-6));         \
            (bits) -= 6;                                \
        }                                               \
    } while (0)

/* Emit every complete 16-bit unit held in the low `bits` bits of ch
   through out.  Expects `errmsg` and a label `utf7Error` in the
   expanding function: a unit in 0xDC00..0xDFFF sets surrogate, stores
   an error message and jumps to utf7Error. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                /* We have already generated an error for the high */   \
                /* surrogate so let's not bother seeing if the low */   \
                /* surrogate is correct or not */                       \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* This is a surrogate pair. Unfortunately we can't */  \
                /* represent it in a 16-bit character */                \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
842
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000843PyObject *PyUnicode_DecodeUTF7(const char *s,
844 int size,
845 const char *errors)
846{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000847 const char *starts = s;
848 int startinpos;
849 int endinpos;
850 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000851 const char *e;
852 PyUnicodeObject *unicode;
853 Py_UNICODE *p;
854 const char *errmsg = "";
855 int inShift = 0;
856 unsigned int bitsleft = 0;
857 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000858 int surrogate = 0;
859 PyObject *errorHandler = NULL;
860 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000861
862 unicode = _PyUnicode_New(size);
863 if (!unicode)
864 return NULL;
865 if (size == 0)
866 return (PyObject *)unicode;
867
868 p = unicode->str;
869 e = s + size;
870
871 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000872 Py_UNICODE ch;
873 restart:
874 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000875
876 if (inShift) {
877 if ((ch == '-') || !B64CHAR(ch)) {
878 inShift = 0;
879 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000880
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000881 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
882 if (bitsleft >= 6) {
883 /* The shift sequence has a partial character in it. If
884 bitsleft < 6 then we could just classify it as padding
885 but that is not the case here */
886
887 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000888 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889 }
890 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000891 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000892 here so indicate the potential of a misencoded character. */
893
894 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
895 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
896 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000897 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000898 }
899
900 if (ch == '-') {
901 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000902 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000903 inShift = 1;
904 }
905 } else if (SPECIAL(ch,0,0)) {
906 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000907 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908 } else {
909 *p++ = ch;
910 }
911 } else {
912 charsleft = (charsleft << 6) | UB64(ch);
913 bitsleft += 6;
914 s++;
915 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
916 }
917 }
918 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000919 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000920 s++;
921 if (s < e && *s == '-') {
922 s++;
923 *p++ = '+';
924 } else
925 {
926 inShift = 1;
927 bitsleft = 0;
928 }
929 }
930 else if (SPECIAL(ch,0,0)) {
931 errmsg = "unexpected special character";
932 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000933 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000934 }
935 else {
936 *p++ = ch;
937 s++;
938 }
939 continue;
940 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000941 outpos = p-PyUnicode_AS_UNICODE(unicode);
942 endinpos = s-starts;
943 if (unicode_decode_call_errorhandler(
944 errors, &errorHandler,
945 "utf7", errmsg,
946 starts, size, &startinpos, &endinpos, &exc, &s,
947 (PyObject **)&unicode, &outpos, &p))
948 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 }
950
951 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000952 outpos = p-PyUnicode_AS_UNICODE(unicode);
953 endinpos = size;
954 if (unicode_decode_call_errorhandler(
955 errors, &errorHandler,
956 "utf7", "unterminated shift sequence",
957 starts, size, &startinpos, &endinpos, &exc, &s,
958 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000959 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000960 if (s < e)
961 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000962 }
963
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000964 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000965 goto onError;
966
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000967 Py_XDECREF(errorHandler);
968 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000969 return (PyObject *)unicode;
970
971onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000972 Py_XDECREF(errorHandler);
973 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974 Py_DECREF(unicode);
975 return NULL;
976}
977
978
979PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
980 int size,
981 int encodeSetO,
982 int encodeWhiteSpace,
983 const char *errors)
984{
985 PyObject *v;
986 /* It might be possible to tighten this worst case */
987 unsigned int cbAllocated = 5 * size;
988 int inShift = 0;
989 int i = 0;
990 unsigned int bitsleft = 0;
991 unsigned long charsleft = 0;
992 char * out;
993 char * start;
994
995 if (size == 0)
996 return PyString_FromStringAndSize(NULL, 0);
997
998 v = PyString_FromStringAndSize(NULL, cbAllocated);
999 if (v == NULL)
1000 return NULL;
1001
1002 start = out = PyString_AS_STRING(v);
1003 for (;i < size; ++i) {
1004 Py_UNICODE ch = s[i];
1005
1006 if (!inShift) {
1007 if (ch == '+') {
1008 *out++ = '+';
1009 *out++ = '-';
1010 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1011 charsleft = ch;
1012 bitsleft = 16;
1013 *out++ = '+';
1014 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1015 inShift = bitsleft > 0;
1016 } else {
1017 *out++ = (char) ch;
1018 }
1019 } else {
1020 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1021 *out++ = B64(charsleft << (6-bitsleft));
1022 charsleft = 0;
1023 bitsleft = 0;
1024 /* Characters not in the BASE64 set implicitly unshift the sequence
1025 so no '-' is required, except if the character is itself a '-' */
1026 if (B64CHAR(ch) || ch == '-') {
1027 *out++ = '-';
1028 }
1029 inShift = 0;
1030 *out++ = (char) ch;
1031 } else {
1032 bitsleft += 16;
1033 charsleft = (charsleft << 16) | ch;
1034 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1035
1036 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001037 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 or '-' then the shift sequence will be terminated implicitly and we
1039 don't have to insert a '-'. */
1040
1041 if (bitsleft == 0) {
1042 if (i + 1 < size) {
1043 Py_UNICODE ch2 = s[i+1];
1044
1045 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001046
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001047 } else if (B64CHAR(ch2) || ch2 == '-') {
1048 *out++ = '-';
1049 inShift = 0;
1050 } else {
1051 inShift = 0;
1052 }
1053
1054 }
1055 else {
1056 *out++ = '-';
1057 inShift = 0;
1058 }
1059 }
Tim Petersced69f82003-09-16 20:30:58 +00001060 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062 }
1063 if (bitsleft) {
1064 *out++= B64(charsleft << (6-bitsleft) );
1065 *out++ = '-';
1066 }
1067
Tim Peters5de98422002-04-27 18:44:32 +00001068 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001069 return v;
1070}
1071
/* The UTF-7 helper macros are private to the codec above; drop them so
   the names cannot leak into the rest of the file. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1078
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079/* --- UTF-8 Codec -------------------------------------------------------- */
1080
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details.  The table is only ever
   read, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1102
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103PyObject *PyUnicode_DecodeUTF8(const char *s,
1104 int size,
1105 const char *errors)
1106{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001107 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001109 int startinpos;
1110 int endinpos;
1111 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 const char *e;
1113 PyUnicodeObject *unicode;
1114 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001115 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001116 PyObject *errorHandler = NULL;
1117 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118
1119 /* Note: size will always be longer than the resulting Unicode
1120 character count */
1121 unicode = _PyUnicode_New(size);
1122 if (!unicode)
1123 return NULL;
1124 if (size == 0)
1125 return (PyObject *)unicode;
1126
1127 /* Unpack UTF-8 encoded data */
1128 p = unicode->str;
1129 e = s + size;
1130
1131 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001132 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133
1134 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001135 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136 s++;
1137 continue;
1138 }
1139
1140 n = utf8_code_length[ch];
1141
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001142 if (s + n > e) {
1143 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001144 startinpos = s-starts;
1145 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001146 goto utf8Error;
1147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148
1149 switch (n) {
1150
1151 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001152 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001153 startinpos = s-starts;
1154 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001155 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156
1157 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001158 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001159 startinpos = s-starts;
1160 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001161 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162
1163 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001164 if ((s[1] & 0xc0) != 0x80) {
1165 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001166 startinpos = s-starts;
1167 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001168 goto utf8Error;
1169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001172 startinpos = s-starts;
1173 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001174 errmsg = "illegal encoding";
1175 goto utf8Error;
1176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 break;
1180
1181 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001182 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001183 (s[2] & 0xc0) != 0x80) {
1184 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001185 startinpos = s-starts;
1186 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001187 goto utf8Error;
1188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001190 if (ch < 0x0800) {
1191 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001192 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001193
1194 XXX For wide builds (UCS-4) we should probably try
1195 to recombine the surrogates into a single code
1196 unit.
1197 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001198 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001199 startinpos = s-starts;
1200 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001201 goto utf8Error;
1202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001204 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001205 break;
1206
1207 case 4:
1208 if ((s[1] & 0xc0) != 0x80 ||
1209 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001210 (s[3] & 0xc0) != 0x80) {
1211 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001212 startinpos = s-starts;
1213 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001214 goto utf8Error;
1215 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001216 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1217 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1218 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001219 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001220 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001221 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001222 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001223 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001225 startinpos = s-starts;
1226 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 goto utf8Error;
1228 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001229#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001230 *p++ = (Py_UNICODE)ch;
1231#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001232 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001233
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001234 /* translate from 10000..10FFFF to 0..FFFF */
1235 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001236
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001237 /* high surrogate = top 10 bits added to D800 */
1238 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001239
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001240 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001241 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001242#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 break;
1244
1245 default:
1246 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001247 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001248 startinpos = s-starts;
1249 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001250 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 }
1252 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001253 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001255 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001256 outpos = p-PyUnicode_AS_UNICODE(unicode);
1257 if (unicode_decode_call_errorhandler(
1258 errors, &errorHandler,
1259 "utf8", errmsg,
1260 starts, size, &startinpos, &endinpos, &exc, &s,
1261 (PyObject **)&unicode, &outpos, &p))
1262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 }
1264
1265 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001266 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267 goto onError;
1268
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001269 Py_XDECREF(errorHandler);
1270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271 return (PyObject *)unicode;
1272
1273onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274 Py_XDECREF(errorHandler);
1275 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 Py_DECREF(unicode);
1277 return NULL;
1278}
1279
Tim Peters602f7402002-04-27 18:03:26 +00001280/* Allocation strategy: if the string is short, convert into a stack buffer
1281 and allocate exactly as much space needed at the end. Else allocate the
1282 maximum possible needed (4 result bytes per Unicode character), and return
1283 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001284*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001285PyObject *
1286PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1287 int size,
1288 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289{
Tim Peters602f7402002-04-27 18:03:26 +00001290#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001291
Tim Peters602f7402002-04-27 18:03:26 +00001292 int i; /* index into s of next input byte */
1293 PyObject *v; /* result string object */
1294 char *p; /* next free byte in output buffer */
1295 int nallocated; /* number of result bytes allocated */
1296 int nneeded; /* number of result bytes needed */
1297 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001298
Tim Peters602f7402002-04-27 18:03:26 +00001299 assert(s != NULL);
1300 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301
Tim Peters602f7402002-04-27 18:03:26 +00001302 if (size <= MAX_SHORT_UNICHARS) {
1303 /* Write into the stack buffer; nallocated can't overflow.
1304 * At the end, we'll allocate exactly as much heap space as it
1305 * turns out we need.
1306 */
1307 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1308 v = NULL; /* will allocate after we're done */
1309 p = stackbuf;
1310 }
1311 else {
1312 /* Overallocate on the heap, and give the excess back at the end. */
1313 nallocated = size * 4;
1314 if (nallocated / 4 != size) /* overflow! */
1315 return PyErr_NoMemory();
1316 v = PyString_FromStringAndSize(NULL, nallocated);
1317 if (v == NULL)
1318 return NULL;
1319 p = PyString_AS_STRING(v);
1320 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001321
Tim Peters602f7402002-04-27 18:03:26 +00001322 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001323 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001324
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001325 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001326 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001328
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001330 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001331 *p++ = (char)(0xc0 | (ch >> 6));
1332 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001333 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001334 else {
Tim Peters602f7402002-04-27 18:03:26 +00001335 /* Encode UCS2 Unicode ordinals */
1336 if (ch < 0x10000) {
1337 /* Special case: check for high surrogate */
1338 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1339 Py_UCS4 ch2 = s[i];
1340 /* Check for low surrogate and combine the two to
1341 form a UCS4 value */
1342 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001343 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001344 i++;
1345 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 }
Tim Peters602f7402002-04-27 18:03:26 +00001347 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001349 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001350 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1351 *p++ = (char)(0x80 | (ch & 0x3f));
1352 continue;
1353 }
1354encodeUCS4:
1355 /* Encode UCS4 Unicode ordinals */
1356 *p++ = (char)(0xf0 | (ch >> 18));
1357 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1358 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1359 *p++ = (char)(0x80 | (ch & 0x3f));
1360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001362
Tim Peters602f7402002-04-27 18:03:26 +00001363 if (v == NULL) {
1364 /* This was stack allocated. */
1365 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1366 assert(nneeded <= nallocated);
1367 v = PyString_FromStringAndSize(stackbuf, nneeded);
1368 }
1369 else {
1370 /* Cut back to size actually needed. */
1371 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1372 assert(nneeded <= nallocated);
1373 _PyString_Resize(&v, nneeded);
1374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001376
Tim Peters602f7402002-04-27 18:03:26 +00001377#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378}
1379
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1381{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 if (!PyUnicode_Check(unicode)) {
1383 PyErr_BadArgument();
1384 return NULL;
1385 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001386 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1387 PyUnicode_GET_SIZE(unicode),
1388 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389}
1390
1391/* --- UTF-16 Codec ------------------------------------------------------- */
1392
Tim Peters772747b2001-08-09 22:21:55 +00001393PyObject *
1394PyUnicode_DecodeUTF16(const char *s,
1395 int size,
1396 const char *errors,
1397 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001399 const char *starts = s;
1400 int startinpos;
1401 int endinpos;
1402 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403 PyUnicodeObject *unicode;
1404 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001405 const unsigned char *q, *e;
1406 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001407 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001408 /* Offsets from q for retrieving byte pairs in the right order. */
1409#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1410 int ihi = 1, ilo = 0;
1411#else
1412 int ihi = 0, ilo = 1;
1413#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 PyObject *errorHandler = NULL;
1415 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416
1417 /* Note: size will always be longer than the resulting Unicode
1418 character count */
1419 unicode = _PyUnicode_New(size);
1420 if (!unicode)
1421 return NULL;
1422 if (size == 0)
1423 return (PyObject *)unicode;
1424
1425 /* Unpack UTF-16 encoded data */
1426 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001427 q = (unsigned char *)s;
1428 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429
1430 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001431 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001433 /* Check for BOM marks (U+FEFF) in the input and adjust current
1434 byte order setting accordingly. In native mode, the leading BOM
1435 mark is skipped, in all other modes, it is copied to the output
1436 stream as-is (giving a ZWNBSP character). */
1437 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001438 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001439#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001440 if (bom == 0xFEFF) {
1441 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001442 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001443 }
1444 else if (bom == 0xFFFE) {
1445 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001446 bo = 1;
1447 }
Tim Petersced69f82003-09-16 20:30:58 +00001448#else
Tim Peters772747b2001-08-09 22:21:55 +00001449 if (bom == 0xFEFF) {
1450 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001451 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001452 }
1453 else if (bom == 0xFFFE) {
1454 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001455 bo = -1;
1456 }
1457#endif
1458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459
Tim Peters772747b2001-08-09 22:21:55 +00001460 if (bo == -1) {
1461 /* force LE */
1462 ihi = 1;
1463 ilo = 0;
1464 }
1465 else if (bo == 1) {
1466 /* force BE */
1467 ihi = 0;
1468 ilo = 1;
1469 }
1470
1471 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 Py_UNICODE ch;
1473 /* remaing bytes at the end? (size should be even) */
1474 if (e-q<2) {
1475 errmsg = "truncated data";
1476 startinpos = ((const char *)q)-starts;
1477 endinpos = ((const char *)e)-starts;
1478 goto utf16Error;
1479 /* The remaining input chars are ignored if the callback
1480 chooses to skip the input */
1481 }
1482 ch = (q[ihi] << 8) | q[ilo];
1483
Tim Peters772747b2001-08-09 22:21:55 +00001484 q += 2;
1485
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 if (ch < 0xD800 || ch > 0xDFFF) {
1487 *p++ = ch;
1488 continue;
1489 }
1490
1491 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001492 if (q >= e) {
1493 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 startinpos = (((const char *)q)-2)-starts;
1495 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001496 goto utf16Error;
1497 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001498 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001499 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1500 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001501 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001502#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001503 *p++ = ch;
1504 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001505#else
1506 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001507#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001508 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001509 }
1510 else {
1511 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001512 startinpos = (((const char *)q)-4)-starts;
1513 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001514 goto utf16Error;
1515 }
1516
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001518 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001519 startinpos = (((const char *)q)-2)-starts;
1520 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001521 /* Fall through to report the error */
1522
1523 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 outpos = p-PyUnicode_AS_UNICODE(unicode);
1525 if (unicode_decode_call_errorhandler(
1526 errors, &errorHandler,
1527 "utf16", errmsg,
1528 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1529 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001530 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001531 }
1532
1533 if (byteorder)
1534 *byteorder = bo;
1535
1536 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001537 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001538 goto onError;
1539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 Py_XDECREF(errorHandler);
1541 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 return (PyObject *)unicode;
1543
1544onError:
1545 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001546 Py_XDECREF(errorHandler);
1547 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548 return NULL;
1549}
1550
Tim Peters772747b2001-08-09 22:21:55 +00001551PyObject *
1552PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1553 int size,
1554 const char *errors,
1555 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556{
1557 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001558 unsigned char *p;
1559 int i, pairs;
1560 /* Offsets from p for storing byte pairs in the right order. */
1561#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1562 int ihi = 1, ilo = 0;
1563#else
1564 int ihi = 0, ilo = 1;
1565#endif
1566
1567#define STORECHAR(CH) \
1568 do { \
1569 p[ihi] = ((CH) >> 8) & 0xff; \
1570 p[ilo] = (CH) & 0xff; \
1571 p += 2; \
1572 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001574 for (i = pairs = 0; i < size; i++)
1575 if (s[i] >= 0x10000)
1576 pairs++;
Tim Petersced69f82003-09-16 20:30:58 +00001577 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001578 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579 if (v == NULL)
1580 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581
Tim Peters772747b2001-08-09 22:21:55 +00001582 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001584 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001585 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001586 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001587
1588 if (byteorder == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (byteorder == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001599 while (size-- > 0) {
1600 Py_UNICODE ch = *s++;
1601 Py_UNICODE ch2 = 0;
1602 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001603 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1604 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605 }
Tim Peters772747b2001-08-09 22:21:55 +00001606 STORECHAR(ch);
1607 if (ch2)
1608 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001611#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612}
1613
1614PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1615{
1616 if (!PyUnicode_Check(unicode)) {
1617 PyErr_BadArgument();
1618 return NULL;
1619 }
1620 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1621 PyUnicode_GET_SIZE(unicode),
1622 NULL,
1623 0);
1624}
1625
1626/* --- Unicode Escape Codec ----------------------------------------------- */
1627
Fredrik Lundh06d12682001-01-24 07:59:11 +00001628static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001629
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1631 int size,
1632 const char *errors)
1633{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001634 const char *starts = s;
1635 int startinpos;
1636 int endinpos;
1637 int outpos;
1638 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001639 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001640 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001642 char* message;
1643 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001644 PyObject *errorHandler = NULL;
1645 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 /* Escaped strings will always be longer than the resulting
1648 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 length after conversion to the true value.
1650 (but if the error callback returns a long replacement string
1651 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 v = _PyUnicode_New(size);
1653 if (v == NULL)
1654 goto onError;
1655 if (size == 0)
1656 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001658 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001660
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 while (s < end) {
1662 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001663 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665
1666 /* Non-escape characters are interpreted as Unicode ordinals */
1667 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001668 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 continue;
1670 }
1671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001672 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 /* \ - Escapes */
1674 s++;
1675 switch (*s++) {
1676
1677 /* \x escapes */
1678 case '\n': break;
1679 case '\\': *p++ = '\\'; break;
1680 case '\'': *p++ = '\''; break;
1681 case '\"': *p++ = '\"'; break;
1682 case 'b': *p++ = '\b'; break;
1683 case 'f': *p++ = '\014'; break; /* FF */
1684 case 't': *p++ = '\t'; break;
1685 case 'n': *p++ = '\n'; break;
1686 case 'r': *p++ = '\r'; break;
1687 case 'v': *p++ = '\013'; break; /* VT */
1688 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1689
1690 /* \OOO (octal) escapes */
1691 case '0': case '1': case '2': case '3':
1692 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001693 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001695 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001697 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001699 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 break;
1701
Fredrik Lundhccc74732001-02-18 22:13:49 +00001702 /* hex escapes */
1703 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001705 digits = 2;
1706 message = "truncated \\xXX escape";
1707 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708
Fredrik Lundhccc74732001-02-18 22:13:49 +00001709 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001711 digits = 4;
1712 message = "truncated \\uXXXX escape";
1713 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Fredrik Lundhccc74732001-02-18 22:13:49 +00001715 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001716 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001717 digits = 8;
1718 message = "truncated \\UXXXXXXXX escape";
1719 hexescape:
1720 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001721 outpos = p-PyUnicode_AS_UNICODE(v);
1722 if (s+digits>end) {
1723 endinpos = size;
1724 if (unicode_decode_call_errorhandler(
1725 errors, &errorHandler,
1726 "unicodeescape", "end of string in escape sequence",
1727 starts, size, &startinpos, &endinpos, &exc, &s,
1728 (PyObject **)&v, &outpos, &p))
1729 goto onError;
1730 goto nextByte;
1731 }
1732 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001733 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001734 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001735 endinpos = (s+i+1)-starts;
1736 if (unicode_decode_call_errorhandler(
1737 errors, &errorHandler,
1738 "unicodeescape", message,
1739 starts, size, &startinpos, &endinpos, &exc, &s,
1740 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001741 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001743 }
1744 chr = (chr<<4) & ~0xF;
1745 if (c >= '0' && c <= '9')
1746 chr += c - '0';
1747 else if (c >= 'a' && c <= 'f')
1748 chr += 10 + c - 'a';
1749 else
1750 chr += 10 + c - 'A';
1751 }
1752 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001753 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001754 /* _decoding_error will have already written into the
1755 target buffer. */
1756 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001757 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001758 /* when we get here, chr is a 32-bit unicode character */
1759 if (chr <= 0xffff)
1760 /* UCS-2 character */
1761 *p++ = (Py_UNICODE) chr;
1762 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001763 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001764 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001765#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001766 *p++ = chr;
1767#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001768 chr -= 0x10000L;
1769 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001770 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001771#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001772 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773 endinpos = s-starts;
1774 outpos = p-PyUnicode_AS_UNICODE(v);
1775 if (unicode_decode_call_errorhandler(
1776 errors, &errorHandler,
1777 "unicodeescape", "illegal Unicode character",
1778 starts, size, &startinpos, &endinpos, &exc, &s,
1779 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001780 goto onError;
1781 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001782 break;
1783
1784 /* \N{name} */
1785 case 'N':
1786 message = "malformed \\N character escape";
1787 if (ucnhash_CAPI == NULL) {
1788 /* load the unicode data module */
1789 PyObject *m, *v;
1790 m = PyImport_ImportModule("unicodedata");
1791 if (m == NULL)
1792 goto ucnhashError;
1793 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1794 Py_DECREF(m);
1795 if (v == NULL)
1796 goto ucnhashError;
1797 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1798 Py_DECREF(v);
1799 if (ucnhash_CAPI == NULL)
1800 goto ucnhashError;
1801 }
1802 if (*s == '{') {
1803 const char *start = s+1;
1804 /* look for the closing brace */
1805 while (*s != '}' && s < end)
1806 s++;
1807 if (s > start && s < end && *s == '}') {
1808 /* found a name. look it up in the unicode database */
1809 message = "unknown Unicode character name";
1810 s++;
1811 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1812 goto store;
1813 }
1814 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 endinpos = s-starts;
1816 outpos = p-PyUnicode_AS_UNICODE(v);
1817 if (unicode_decode_call_errorhandler(
1818 errors, &errorHandler,
1819 "unicodeescape", message,
1820 starts, size, &startinpos, &endinpos, &exc, &s,
1821 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001823 break;
1824
1825 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001826 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 message = "\\ at end of string";
1828 s--;
1829 endinpos = s-starts;
1830 outpos = p-PyUnicode_AS_UNICODE(v);
1831 if (unicode_decode_call_errorhandler(
1832 errors, &errorHandler,
1833 "unicodeescape", message,
1834 starts, size, &startinpos, &endinpos, &exc, &s,
1835 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001836 goto onError;
1837 }
1838 else {
1839 *p++ = '\\';
1840 *p++ = (unsigned char)s[-1];
1841 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001842 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001843 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 nextByte:
1845 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001847 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001848 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001849 Py_XDECREF(errorHandler);
1850 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001852
Fredrik Lundhccc74732001-02-18 22:13:49 +00001853ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001854 PyErr_SetString(
1855 PyExc_UnicodeError,
1856 "\\N escapes not supported (can't load unicodedata module)"
1857 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 Py_XDECREF(errorHandler);
1859 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001860 return NULL;
1861
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001864 Py_XDECREF(errorHandler);
1865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 return NULL;
1867}
1868
1869/* Return a Unicode-Escape string version of the Unicode object.
1870
1871 If quotes is true, the string is enclosed in u"" or u'' quotes as
1872 appropriate.
1873
1874*/
1875
Barry Warsaw51ac5802000-03-20 16:36:48 +00001876static const Py_UNICODE *findchar(const Py_UNICODE *s,
1877 int size,
1878 Py_UNICODE ch);
1879
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880static
1881PyObject *unicodeescape_string(const Py_UNICODE *s,
1882 int size,
1883 int quotes)
1884{
1885 PyObject *repr;
1886 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001888 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889
1890 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1891 if (repr == NULL)
1892 return NULL;
1893
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001894 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895
1896 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001898 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 !findchar(s, size, '"')) ? '"' : '\'';
1900 }
1901 while (size-- > 0) {
1902 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001903
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001905 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001906 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907 *p++ = '\\';
1908 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001909 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001910 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001911
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001912#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001913 /* Map 21-bit characters to '\U00xxxxxx' */
1914 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001915 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001916
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001917 /* Resize the string if necessary */
1918 if (offset + 12 > PyString_GET_SIZE(repr)) {
1919 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001920 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001921 p = PyString_AS_STRING(repr) + offset;
1922 }
1923
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001924 *p++ = '\\';
1925 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001926 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1927 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1928 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1929 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1930 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1931 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1932 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001933 *p++ = hexdigit[ch & 0x0000000F];
1934 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001935 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001936#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001937 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1938 else if (ch >= 0xD800 && ch < 0xDC00) {
1939 Py_UNICODE ch2;
1940 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00001941
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001942 ch2 = *s++;
1943 size--;
1944 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1945 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1946 *p++ = '\\';
1947 *p++ = 'U';
1948 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1949 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1950 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1951 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1952 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1953 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1954 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1955 *p++ = hexdigit[ucs & 0x0000000F];
1956 continue;
1957 }
1958 /* Fall through: isolated surrogates are copied as-is */
1959 s--;
1960 size++;
1961 }
1962
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001964 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 *p++ = '\\';
1966 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001967 *p++ = hexdigit[(ch >> 12) & 0x000F];
1968 *p++ = hexdigit[(ch >> 8) & 0x000F];
1969 *p++ = hexdigit[(ch >> 4) & 0x000F];
1970 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001972
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001973 /* Map special whitespace to '\t', \n', '\r' */
1974 else if (ch == '\t') {
1975 *p++ = '\\';
1976 *p++ = 't';
1977 }
1978 else if (ch == '\n') {
1979 *p++ = '\\';
1980 *p++ = 'n';
1981 }
1982 else if (ch == '\r') {
1983 *p++ = '\\';
1984 *p++ = 'r';
1985 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001986
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001987 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001988 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001990 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001991 *p++ = hexdigit[(ch >> 4) & 0x000F];
1992 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00001993 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001994
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995 /* Copy everything else as-is */
1996 else
1997 *p++ = (char) ch;
1998 }
1999 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002000 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
2002 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002003 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002004 return repr;
2005}
2006
2007PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2008 int size)
2009{
2010 return unicodeescape_string(s, size, 0);
2011}
2012
2013PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2014{
2015 if (!PyUnicode_Check(unicode)) {
2016 PyErr_BadArgument();
2017 return NULL;
2018 }
2019 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2020 PyUnicode_GET_SIZE(unicode));
2021}
2022
2023/* --- Raw Unicode Escape Codec ------------------------------------------- */
2024
2025PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2026 int size,
2027 const char *errors)
2028{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002029 const char *starts = s;
2030 int startinpos;
2031 int endinpos;
2032 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002034 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 const char *end;
2036 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002037 PyObject *errorHandler = NULL;
2038 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002039
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 /* Escaped strings will always be longer than the resulting
2041 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002042 length after conversion to the true value. (But decoding error
2043 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 v = _PyUnicode_New(size);
2045 if (v == NULL)
2046 goto onError;
2047 if (size == 0)
2048 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 end = s + size;
2051 while (s < end) {
2052 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002053 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002055 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056
2057 /* Non-escape characters are interpreted as Unicode ordinals */
2058 if (*s != '\\') {
2059 *p++ = (unsigned char)*s++;
2060 continue;
2061 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063
2064 /* \u-escapes are only interpreted iff the number of leading
2065 backslashes if odd */
2066 bs = s;
2067 for (;s < end;) {
2068 if (*s != '\\')
2069 break;
2070 *p++ = (unsigned char)*s++;
2071 }
2072 if (((s - bs) & 1) == 0 ||
2073 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002074 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 continue;
2076 }
2077 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002078 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 s++;
2080
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002081 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002083 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002084 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002086 endinpos = s-starts;
2087 if (unicode_decode_call_errorhandler(
2088 errors, &errorHandler,
2089 "rawunicodeescape", "truncated \\uXXXX",
2090 starts, size, &startinpos, &endinpos, &exc, &s,
2091 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002093 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 }
2095 x = (x<<4) & ~0xF;
2096 if (c >= '0' && c <= '9')
2097 x += c - '0';
2098 else if (c >= 'a' && c <= 'f')
2099 x += 10 + c - 'a';
2100 else
2101 x += 10 + c - 'A';
2102 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002103#ifndef Py_UNICODE_WIDE
2104 if (x > 0x10000) {
2105 if (unicode_decode_call_errorhandler(
2106 errors, &errorHandler,
2107 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2108 starts, size, &startinpos, &endinpos, &exc, &s,
2109 (PyObject **)&v, &outpos, &p))
2110 goto onError;
2111 }
2112#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113 *p++ = x;
2114 nextByte:
2115 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002117 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002118 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 Py_XDECREF(errorHandler);
2120 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002122
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 onError:
2124 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 Py_XDECREF(errorHandler);
2126 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 return NULL;
2128}
2129
2130PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2131 int size)
2132{
2133 PyObject *repr;
2134 char *p;
2135 char *q;
2136
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002137 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002139#ifdef Py_UNICODE_WIDE
2140 repr = PyString_FromStringAndSize(NULL, 10 * size);
2141#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 if (repr == NULL)
2145 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002146 if (size == 0)
2147 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148
2149 p = q = PyString_AS_STRING(repr);
2150 while (size-- > 0) {
2151 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002152#ifdef Py_UNICODE_WIDE
2153 /* Map 32-bit characters to '\Uxxxxxxxx' */
2154 if (ch >= 0x10000) {
2155 *p++ = '\\';
2156 *p++ = 'U';
2157 *p++ = hexdigit[(ch >> 28) & 0xf];
2158 *p++ = hexdigit[(ch >> 24) & 0xf];
2159 *p++ = hexdigit[(ch >> 20) & 0xf];
2160 *p++ = hexdigit[(ch >> 16) & 0xf];
2161 *p++ = hexdigit[(ch >> 12) & 0xf];
2162 *p++ = hexdigit[(ch >> 8) & 0xf];
2163 *p++ = hexdigit[(ch >> 4) & 0xf];
2164 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002165 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002166 else
2167#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 /* Map 16-bit characters to '\uxxxx' */
2169 if (ch >= 256) {
2170 *p++ = '\\';
2171 *p++ = 'u';
2172 *p++ = hexdigit[(ch >> 12) & 0xf];
2173 *p++ = hexdigit[(ch >> 8) & 0xf];
2174 *p++ = hexdigit[(ch >> 4) & 0xf];
2175 *p++ = hexdigit[ch & 15];
2176 }
2177 /* Copy everything else as-is */
2178 else
2179 *p++ = (char) ch;
2180 }
2181 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002182 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 return repr;
2184}
2185
2186PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2187{
2188 if (!PyUnicode_Check(unicode)) {
2189 PyErr_BadArgument();
2190 return NULL;
2191 }
2192 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2193 PyUnicode_GET_SIZE(unicode));
2194}
2195
2196/* --- Latin-1 Codec ------------------------------------------------------ */
2197
2198PyObject *PyUnicode_DecodeLatin1(const char *s,
2199 int size,
2200 const char *errors)
2201{
2202 PyUnicodeObject *v;
2203 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002204
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002206 if (size == 1 && *(unsigned char*)s < 256) {
2207 Py_UNICODE r = *(unsigned char*)s;
2208 return PyUnicode_FromUnicode(&r, 1);
2209 }
2210
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 v = _PyUnicode_New(size);
2212 if (v == NULL)
2213 goto onError;
2214 if (size == 0)
2215 return (PyObject *)v;
2216 p = PyUnicode_AS_UNICODE(v);
2217 while (size-- > 0)
2218 *p++ = (unsigned char)*s++;
2219 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002220
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 onError:
2222 Py_XDECREF(v);
2223 return NULL;
2224}
2225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002226/* create or adjust a UnicodeEncodeError */
2227static void make_encode_exception(PyObject **exceptionObject,
2228 const char *encoding,
2229 const Py_UNICODE *unicode, int size,
2230 int startpos, int endpos,
2231 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002233 if (*exceptionObject == NULL) {
2234 *exceptionObject = PyUnicodeEncodeError_Create(
2235 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 }
2237 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002238 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2239 goto onError;
2240 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2241 goto onError;
2242 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2243 goto onError;
2244 return;
2245 onError:
2246 Py_DECREF(*exceptionObject);
2247 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 }
2249}
2250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002251/* raises a UnicodeEncodeError */
2252static void raise_encode_exception(PyObject **exceptionObject,
2253 const char *encoding,
2254 const Py_UNICODE *unicode, int size,
2255 int startpos, int endpos,
2256 const char *reason)
2257{
2258 make_encode_exception(exceptionObject,
2259 encoding, unicode, size, startpos, endpos, reason);
2260 if (*exceptionObject != NULL)
2261 PyCodec_StrictErrors(*exceptionObject);
2262}
2263
2264/* error handling callback helper:
2265 build arguments, call the callback and check the arguments,
2266 put the result into newpos and return the replacement string, which
2267 has to be freed by the caller */
2268static PyObject *unicode_encode_call_errorhandler(const char *errors,
2269 PyObject **errorHandler,
2270 const char *encoding, const char *reason,
2271 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2272 int startpos, int endpos,
2273 int *newpos)
2274{
2275 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2276
2277 PyObject *restuple;
2278 PyObject *resunicode;
2279
2280 if (*errorHandler == NULL) {
2281 *errorHandler = PyCodec_LookupError(errors);
2282 if (*errorHandler == NULL)
2283 return NULL;
2284 }
2285
2286 make_encode_exception(exceptionObject,
2287 encoding, unicode, size, startpos, endpos, reason);
2288 if (*exceptionObject == NULL)
2289 return NULL;
2290
2291 restuple = PyObject_CallFunctionObjArgs(
2292 *errorHandler, *exceptionObject, NULL);
2293 if (restuple == NULL)
2294 return NULL;
2295 if (!PyTuple_Check(restuple)) {
2296 PyErr_Format(PyExc_TypeError, &argparse[4]);
2297 Py_DECREF(restuple);
2298 return NULL;
2299 }
2300 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2301 &resunicode, newpos)) {
2302 Py_DECREF(restuple);
2303 return NULL;
2304 }
2305 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002306 *newpos = size+*newpos;
2307 if (*newpos<0 || *newpos>size) {
2308 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2309 Py_DECREF(restuple);
2310 return NULL;
2311 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312 Py_INCREF(resunicode);
2313 Py_DECREF(restuple);
2314 return resunicode;
2315}
2316
2317static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2318 int size,
2319 const char *errors,
2320 int limit)
2321{
2322 /* output object */
2323 PyObject *res;
2324 /* pointers to the beginning and end+1 of input */
2325 const Py_UNICODE *startp = p;
2326 const Py_UNICODE *endp = p + size;
2327 /* pointer to the beginning of the unencodable characters */
2328 /* const Py_UNICODE *badp = NULL; */
2329 /* pointer into the output */
2330 char *str;
2331 /* current output position */
2332 int respos = 0;
2333 int ressize;
2334 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2335 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2336 PyObject *errorHandler = NULL;
2337 PyObject *exc = NULL;
2338 /* the following variable is used for caching string comparisons
2339 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2340 int known_errorHandler = -1;
2341
2342 /* allocate enough for a simple encoding without
2343 replacements, if we need more, we'll resize */
2344 res = PyString_FromStringAndSize(NULL, size);
2345 if (res == NULL)
2346 goto onError;
2347 if (size == 0)
2348 return res;
2349 str = PyString_AS_STRING(res);
2350 ressize = size;
2351
2352 while (p<endp) {
2353 Py_UNICODE c = *p;
2354
2355 /* can we encode this? */
2356 if (c<limit) {
2357 /* no overflow check, because we know that the space is enough */
2358 *str++ = (char)c;
2359 ++p;
2360 }
2361 else {
2362 int unicodepos = p-startp;
2363 int requiredsize;
2364 PyObject *repunicode;
2365 int repsize;
2366 int newpos;
2367 int respos;
2368 Py_UNICODE *uni2;
2369 /* startpos for collecting unencodable chars */
2370 const Py_UNICODE *collstart = p;
2371 const Py_UNICODE *collend = p;
2372 /* find all unecodable characters */
2373 while ((collend < endp) && ((*collend)>=limit))
2374 ++collend;
2375 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2376 if (known_errorHandler==-1) {
2377 if ((errors==NULL) || (!strcmp(errors, "strict")))
2378 known_errorHandler = 1;
2379 else if (!strcmp(errors, "replace"))
2380 known_errorHandler = 2;
2381 else if (!strcmp(errors, "ignore"))
2382 known_errorHandler = 3;
2383 else if (!strcmp(errors, "xmlcharrefreplace"))
2384 known_errorHandler = 4;
2385 else
2386 known_errorHandler = 0;
2387 }
2388 switch (known_errorHandler) {
2389 case 1: /* strict */
2390 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2391 goto onError;
2392 case 2: /* replace */
2393 while (collstart++<collend)
2394 *str++ = '?'; /* fall through */
2395 case 3: /* ignore */
2396 p = collend;
2397 break;
2398 case 4: /* xmlcharrefreplace */
2399 respos = str-PyString_AS_STRING(res);
2400 /* determine replacement size (temporarily (mis)uses p) */
2401 for (p = collstart, repsize = 0; p < collend; ++p) {
2402 if (*p<10)
2403 repsize += 2+1+1;
2404 else if (*p<100)
2405 repsize += 2+2+1;
2406 else if (*p<1000)
2407 repsize += 2+3+1;
2408 else if (*p<10000)
2409 repsize += 2+4+1;
2410 else if (*p<100000)
2411 repsize += 2+5+1;
2412 else if (*p<1000000)
2413 repsize += 2+6+1;
2414 else
2415 repsize += 2+7+1;
2416 }
2417 requiredsize = respos+repsize+(endp-collend);
2418 if (requiredsize > ressize) {
2419 if (requiredsize<2*ressize)
2420 requiredsize = 2*ressize;
2421 if (_PyString_Resize(&res, requiredsize))
2422 goto onError;
2423 str = PyString_AS_STRING(res) + respos;
2424 ressize = requiredsize;
2425 }
2426 /* generate replacement (temporarily (mis)uses p) */
2427 for (p = collstart; p < collend; ++p) {
2428 str += sprintf(str, "&#%d;", (int)*p);
2429 }
2430 p = collend;
2431 break;
2432 default:
2433 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2434 encoding, reason, startp, size, &exc,
2435 collstart-startp, collend-startp, &newpos);
2436 if (repunicode == NULL)
2437 goto onError;
2438 /* need more space? (at least enough for what we
2439 have+the replacement+the rest of the string, so
2440 we won't have to check space for encodable characters) */
2441 respos = str-PyString_AS_STRING(res);
2442 repsize = PyUnicode_GET_SIZE(repunicode);
2443 requiredsize = respos+repsize+(endp-collend);
2444 if (requiredsize > ressize) {
2445 if (requiredsize<2*ressize)
2446 requiredsize = 2*ressize;
2447 if (_PyString_Resize(&res, requiredsize)) {
2448 Py_DECREF(repunicode);
2449 goto onError;
2450 }
2451 str = PyString_AS_STRING(res) + respos;
2452 ressize = requiredsize;
2453 }
2454 /* check if there is anything unencodable in the replacement
2455 and copy it to the output */
2456 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2457 c = *uni2;
2458 if (c >= limit) {
2459 raise_encode_exception(&exc, encoding, startp, size,
2460 unicodepos, unicodepos+1, reason);
2461 Py_DECREF(repunicode);
2462 goto onError;
2463 }
2464 *str = (char)c;
2465 }
2466 p = startp + newpos;
2467 Py_DECREF(repunicode);
2468 }
2469 }
2470 }
2471 /* Resize if we allocated to much */
2472 respos = str-PyString_AS_STRING(res);
2473 if (respos<ressize)
2474 /* If this falls res will be NULL */
2475 _PyString_Resize(&res, respos);
2476 Py_XDECREF(errorHandler);
2477 Py_XDECREF(exc);
2478 return res;
2479
2480 onError:
2481 Py_XDECREF(res);
2482 Py_XDECREF(errorHandler);
2483 Py_XDECREF(exc);
2484 return NULL;
2485}
2486
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2488 int size,
2489 const char *errors)
2490{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492}
2493
2494PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2495{
2496 if (!PyUnicode_Check(unicode)) {
2497 PyErr_BadArgument();
2498 return NULL;
2499 }
2500 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2501 PyUnicode_GET_SIZE(unicode),
2502 NULL);
2503}
2504
2505/* --- 7-bit ASCII Codec -------------------------------------------------- */
2506
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507PyObject *PyUnicode_DecodeASCII(const char *s,
2508 int size,
2509 const char *errors)
2510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 PyUnicodeObject *v;
2513 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002514 int startinpos;
2515 int endinpos;
2516 int outpos;
2517 const char *e;
2518 PyObject *errorHandler = NULL;
2519 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002520
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002522 if (size == 1 && *(unsigned char*)s < 128) {
2523 Py_UNICODE r = *(unsigned char*)s;
2524 return PyUnicode_FromUnicode(&r, 1);
2525 }
Tim Petersced69f82003-09-16 20:30:58 +00002526
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 v = _PyUnicode_New(size);
2528 if (v == NULL)
2529 goto onError;
2530 if (size == 0)
2531 return (PyObject *)v;
2532 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002533 e = s + size;
2534 while (s < e) {
2535 register unsigned char c = (unsigned char)*s;
2536 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 ++s;
2539 }
2540 else {
2541 startinpos = s-starts;
2542 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002543 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 if (unicode_decode_call_errorhandler(
2545 errors, &errorHandler,
2546 "ascii", "ordinal not in range(128)",
2547 starts, size, &startinpos, &endinpos, &exc, &s,
2548 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002552 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002553 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002554 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002555 Py_XDECREF(errorHandler);
2556 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002558
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 onError:
2560 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 Py_XDECREF(errorHandler);
2562 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 return NULL;
2564}
2565
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2567 int size,
2568 const char *errors)
2569{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002570 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571}
2572
2573PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2574{
2575 if (!PyUnicode_Check(unicode)) {
2576 PyErr_BadArgument();
2577 return NULL;
2578 }
2579 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2580 PyUnicode_GET_SIZE(unicode),
2581 NULL);
2582}
2583
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002584#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002585
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002586/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002587
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002588PyObject *PyUnicode_DecodeMBCS(const char *s,
2589 int size,
2590 const char *errors)
2591{
2592 PyUnicodeObject *v;
2593 Py_UNICODE *p;
2594
2595 /* First get the size of the result */
2596 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002597 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002598 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2599
2600 v = _PyUnicode_New(usize);
2601 if (v == NULL)
2602 return NULL;
2603 if (usize == 0)
2604 return (PyObject *)v;
2605 p = PyUnicode_AS_UNICODE(v);
2606 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2607 Py_DECREF(v);
2608 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2609 }
2610
2611 return (PyObject *)v;
2612}
2613
2614PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2615 int size,
2616 const char *errors)
2617{
2618 PyObject *repr;
2619 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002620 DWORD mbcssize;
2621
2622 /* If there are no characters, bail now! */
2623 if (size==0)
2624 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002625
2626 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002627 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002628 if (mbcssize==0)
2629 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2630
2631 repr = PyString_FromStringAndSize(NULL, mbcssize);
2632 if (repr == NULL)
2633 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002634 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002635 return repr;
2636
2637 /* Do the conversion */
2638 s = PyString_AS_STRING(repr);
2639 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2640 Py_DECREF(repr);
2641 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2642 }
2643 return repr;
2644}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002645
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002646PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2647{
2648 if (!PyUnicode_Check(unicode)) {
2649 PyErr_BadArgument();
2650 return NULL;
2651 }
2652 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2653 PyUnicode_GET_SIZE(unicode),
2654 NULL);
2655}
2656
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002657#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002658
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659/* --- Character Mapping Codec -------------------------------------------- */
2660
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661PyObject *PyUnicode_DecodeCharmap(const char *s,
2662 int size,
2663 PyObject *mapping,
2664 const char *errors)
2665{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002666 const char *starts = s;
2667 int startinpos;
2668 int endinpos;
2669 int outpos;
2670 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 PyUnicodeObject *v;
2672 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002673 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 PyObject *errorHandler = NULL;
2675 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002676
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 /* Default to Latin-1 */
2678 if (mapping == NULL)
2679 return PyUnicode_DecodeLatin1(s, size, errors);
2680
2681 v = _PyUnicode_New(size);
2682 if (v == NULL)
2683 goto onError;
2684 if (size == 0)
2685 return (PyObject *)v;
2686 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002687 e = s + size;
2688 while (s < e) {
2689 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 PyObject *w, *x;
2691
2692 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2693 w = PyInt_FromLong((long)ch);
2694 if (w == NULL)
2695 goto onError;
2696 x = PyObject_GetItem(mapping, w);
2697 Py_DECREF(w);
2698 if (x == NULL) {
2699 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002700 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002702 x = Py_None;
2703 Py_INCREF(x);
2704 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002705 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 }
2707
2708 /* Apply mapping */
2709 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002710 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711 if (value < 0 || value > 65535) {
2712 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002713 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 Py_DECREF(x);
2715 goto onError;
2716 }
2717 *p++ = (Py_UNICODE)value;
2718 }
2719 else if (x == Py_None) {
2720 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 outpos = p-PyUnicode_AS_UNICODE(v);
2722 startinpos = s-starts;
2723 endinpos = startinpos+1;
2724 if (unicode_decode_call_errorhandler(
2725 errors, &errorHandler,
2726 "charmap", "character maps to <undefined>",
2727 starts, size, &startinpos, &endinpos, &exc, &s,
2728 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 Py_DECREF(x);
2730 goto onError;
2731 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 }
2734 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002735 int targetsize = PyUnicode_GET_SIZE(x);
2736
2737 if (targetsize == 1)
2738 /* 1-1 mapping */
2739 *p++ = *PyUnicode_AS_UNICODE(x);
2740
2741 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002743 if (targetsize > extrachars) {
2744 /* resize first */
2745 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2746 int needed = (targetsize - extrachars) + \
2747 (targetsize << 2);
2748 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002749 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002750 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002751 Py_DECREF(x);
2752 goto onError;
2753 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002754 p = PyUnicode_AS_UNICODE(v) + oldpos;
2755 }
2756 Py_UNICODE_COPY(p,
2757 PyUnicode_AS_UNICODE(x),
2758 targetsize);
2759 p += targetsize;
2760 extrachars -= targetsize;
2761 }
2762 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 }
2764 else {
2765 /* wrong return value */
2766 PyErr_SetString(PyExc_TypeError,
2767 "character mapping must return integer, None or unicode");
2768 Py_DECREF(x);
2769 goto onError;
2770 }
2771 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 }
2774 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002775 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002777 Py_XDECREF(errorHandler);
2778 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002780
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 Py_XDECREF(errorHandler);
2783 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 Py_XDECREF(v);
2785 return NULL;
2786}
2787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788/* Lookup the character ch in the mapping. If the character
2789 can't be found, Py_None is returned (or NULL, if another
2790 error occured). */
2791static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 PyObject *w = PyInt_FromLong((long)c);
2794 PyObject *x;
2795
2796 if (w == NULL)
2797 return NULL;
2798 x = PyObject_GetItem(mapping, w);
2799 Py_DECREF(w);
2800 if (x == NULL) {
2801 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2802 /* No mapping found means: mapping is undefined. */
2803 PyErr_Clear();
2804 x = Py_None;
2805 Py_INCREF(x);
2806 return x;
2807 } else
2808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002810 else if (x == Py_None)
2811 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002812 else if (PyInt_Check(x)) {
2813 long value = PyInt_AS_LONG(x);
2814 if (value < 0 || value > 255) {
2815 PyErr_SetString(PyExc_TypeError,
2816 "character mapping must be in range(256)");
2817 Py_DECREF(x);
2818 return NULL;
2819 }
2820 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 else if (PyString_Check(x))
2823 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002825 /* wrong return value */
2826 PyErr_SetString(PyExc_TypeError,
2827 "character mapping must return integer, None or str");
2828 Py_DECREF(x);
2829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 }
2831}
2832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833/* lookup the character, put the result in the output string and adjust
2834 various state variables. Reallocate the output string if not enough
2835 space is available. Return a new reference to the object that
2836 was put in the output buffer, or Py_None, if the mapping was undefined
2837 (in which case no character was written) or NULL, if a
2838 reallocation error ocurred. The called must decref the result */
2839static
2840PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2841 PyObject **outobj, int *outpos)
2842{
2843 PyObject *rep = charmapencode_lookup(c, mapping);
2844
2845 if (rep==NULL)
2846 return NULL;
2847 else if (rep==Py_None)
2848 return rep;
2849 else {
2850 char *outstart = PyString_AS_STRING(*outobj);
2851 int outsize = PyString_GET_SIZE(*outobj);
2852 if (PyInt_Check(rep)) {
2853 int requiredsize = *outpos+1;
2854 if (outsize<requiredsize) {
2855 /* exponentially overallocate to minimize reallocations */
2856 if (requiredsize < 2*outsize)
2857 requiredsize = 2*outsize;
2858 if (_PyString_Resize(outobj, requiredsize)) {
2859 Py_DECREF(rep);
2860 return NULL;
2861 }
2862 outstart = PyString_AS_STRING(*outobj);
2863 }
2864 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2865 }
2866 else {
2867 const char *repchars = PyString_AS_STRING(rep);
2868 int repsize = PyString_GET_SIZE(rep);
2869 int requiredsize = *outpos+repsize;
2870 if (outsize<requiredsize) {
2871 /* exponentially overallocate to minimize reallocations */
2872 if (requiredsize < 2*outsize)
2873 requiredsize = 2*outsize;
2874 if (_PyString_Resize(outobj, requiredsize)) {
2875 Py_DECREF(rep);
2876 return NULL;
2877 }
2878 outstart = PyString_AS_STRING(*outobj);
2879 }
2880 memcpy(outstart + *outpos, repchars, repsize);
2881 *outpos += repsize;
2882 }
2883 }
2884 return rep;
2885}
2886
2887/* handle an error in PyUnicode_EncodeCharmap
2888 Return 0 on success, -1 on error */
2889static
2890int charmap_encoding_error(
2891 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2892 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002893 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002894 PyObject **res, int *respos)
2895{
2896 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2897 int repsize;
2898 int newpos;
2899 Py_UNICODE *uni2;
2900 /* startpos for collecting unencodable chars */
2901 int collstartpos = *inpos;
2902 int collendpos = *inpos+1;
2903 int collpos;
2904 char *encoding = "charmap";
2905 char *reason = "character maps to <undefined>";
2906
2907 PyObject *x;
2908 /* find all unencodable characters */
2909 while (collendpos < size) {
2910 x = charmapencode_lookup(p[collendpos], mapping);
2911 if (x==NULL)
2912 return -1;
2913 else if (x!=Py_None) {
2914 Py_DECREF(x);
2915 break;
2916 }
2917 Py_DECREF(x);
2918 ++collendpos;
2919 }
2920 /* cache callback name lookup
2921 * (if not done yet, i.e. it's the first error) */
2922 if (*known_errorHandler==-1) {
2923 if ((errors==NULL) || (!strcmp(errors, "strict")))
2924 *known_errorHandler = 1;
2925 else if (!strcmp(errors, "replace"))
2926 *known_errorHandler = 2;
2927 else if (!strcmp(errors, "ignore"))
2928 *known_errorHandler = 3;
2929 else if (!strcmp(errors, "xmlcharrefreplace"))
2930 *known_errorHandler = 4;
2931 else
2932 *known_errorHandler = 0;
2933 }
2934 switch (*known_errorHandler) {
2935 case 1: /* strict */
2936 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2937 return -1;
2938 case 2: /* replace */
2939 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2940 x = charmapencode_output('?', mapping, res, respos);
2941 if (x==NULL) {
2942 return -1;
2943 }
2944 else if (x==Py_None) {
2945 Py_DECREF(x);
2946 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2947 return -1;
2948 }
2949 Py_DECREF(x);
2950 }
2951 /* fall through */
2952 case 3: /* ignore */
2953 *inpos = collendpos;
2954 break;
2955 case 4: /* xmlcharrefreplace */
2956 /* generate replacement (temporarily (mis)uses p) */
2957 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2958 char buffer[2+29+1+1];
2959 char *cp;
2960 sprintf(buffer, "&#%d;", (int)p[collpos]);
2961 for (cp = buffer; *cp; ++cp) {
2962 x = charmapencode_output(*cp, mapping, res, respos);
2963 if (x==NULL)
2964 return -1;
2965 else if (x==Py_None) {
2966 Py_DECREF(x);
2967 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2968 return -1;
2969 }
2970 Py_DECREF(x);
2971 }
2972 }
2973 *inpos = collendpos;
2974 break;
2975 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002976 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002977 encoding, reason, p, size, exceptionObject,
2978 collstartpos, collendpos, &newpos);
2979 if (repunicode == NULL)
2980 return -1;
2981 /* generate replacement */
2982 repsize = PyUnicode_GET_SIZE(repunicode);
2983 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2984 x = charmapencode_output(*uni2, mapping, res, respos);
2985 if (x==NULL) {
2986 Py_DECREF(repunicode);
2987 return -1;
2988 }
2989 else if (x==Py_None) {
2990 Py_DECREF(repunicode);
2991 Py_DECREF(x);
2992 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2993 return -1;
2994 }
2995 Py_DECREF(x);
2996 }
2997 *inpos = newpos;
2998 Py_DECREF(repunicode);
2999 }
3000 return 0;
3001}
3002
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3004 int size,
3005 PyObject *mapping,
3006 const char *errors)
3007{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003008 /* output object */
3009 PyObject *res = NULL;
3010 /* current input position */
3011 int inpos = 0;
3012 /* current output position */
3013 int respos = 0;
3014 PyObject *errorHandler = NULL;
3015 PyObject *exc = NULL;
3016 /* the following variable is used for caching string comparisons
3017 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3018 * 3=ignore, 4=xmlcharrefreplace */
3019 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020
3021 /* Default to Latin-1 */
3022 if (mapping == NULL)
3023 return PyUnicode_EncodeLatin1(p, size, errors);
3024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 /* allocate enough for a simple encoding without
3026 replacements, if we need more, we'll resize */
3027 res = PyString_FromStringAndSize(NULL, size);
3028 if (res == NULL)
3029 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003030 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 while (inpos<size) {
3034 /* try to encode it */
3035 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3036 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 if (x==Py_None) { /* unencodable character */
3039 if (charmap_encoding_error(p, size, &inpos, mapping,
3040 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003041 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003042 &res, &respos)) {
3043 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003044 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003047 else
3048 /* done with this character => adjust input position */
3049 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 Py_DECREF(x);
3051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 /* Resize if we allocated to much */
3054 if (respos<PyString_GET_SIZE(res)) {
3055 if (_PyString_Resize(&res, respos))
3056 goto onError;
3057 }
3058 Py_XDECREF(exc);
3059 Py_XDECREF(errorHandler);
3060 return res;
3061
3062 onError:
3063 Py_XDECREF(res);
3064 Py_XDECREF(exc);
3065 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066 return NULL;
3067}
3068
3069PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3070 PyObject *mapping)
3071{
3072 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3073 PyErr_BadArgument();
3074 return NULL;
3075 }
3076 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3077 PyUnicode_GET_SIZE(unicode),
3078 mapping,
3079 NULL);
3080}
3081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003082/* create or adjust a UnicodeTranslateError */
3083static void make_translate_exception(PyObject **exceptionObject,
3084 const Py_UNICODE *unicode, int size,
3085 int startpos, int endpos,
3086 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 if (*exceptionObject == NULL) {
3089 *exceptionObject = PyUnicodeTranslateError_Create(
3090 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
3092 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003093 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3094 goto onError;
3095 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3096 goto onError;
3097 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3098 goto onError;
3099 return;
3100 onError:
3101 Py_DECREF(*exceptionObject);
3102 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 }
3104}
3105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003106/* raises a UnicodeTranslateError */
3107static void raise_translate_exception(PyObject **exceptionObject,
3108 const Py_UNICODE *unicode, int size,
3109 int startpos, int endpos,
3110 const char *reason)
3111{
3112 make_translate_exception(exceptionObject,
3113 unicode, size, startpos, endpos, reason);
3114 if (*exceptionObject != NULL)
3115 PyCodec_StrictErrors(*exceptionObject);
3116}
3117
3118/* error handling callback helper:
3119 build arguments, call the callback and check the arguments,
3120 put the result into newpos and return the replacement string, which
3121 has to be freed by the caller */
3122static PyObject *unicode_translate_call_errorhandler(const char *errors,
3123 PyObject **errorHandler,
3124 const char *reason,
3125 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3126 int startpos, int endpos,
3127 int *newpos)
3128{
3129 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3130
3131 PyObject *restuple;
3132 PyObject *resunicode;
3133
3134 if (*errorHandler == NULL) {
3135 *errorHandler = PyCodec_LookupError(errors);
3136 if (*errorHandler == NULL)
3137 return NULL;
3138 }
3139
3140 make_translate_exception(exceptionObject,
3141 unicode, size, startpos, endpos, reason);
3142 if (*exceptionObject == NULL)
3143 return NULL;
3144
3145 restuple = PyObject_CallFunctionObjArgs(
3146 *errorHandler, *exceptionObject, NULL);
3147 if (restuple == NULL)
3148 return NULL;
3149 if (!PyTuple_Check(restuple)) {
3150 PyErr_Format(PyExc_TypeError, &argparse[4]);
3151 Py_DECREF(restuple);
3152 return NULL;
3153 }
3154 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3155 &resunicode, newpos)) {
3156 Py_DECREF(restuple);
3157 return NULL;
3158 }
3159 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003160 *newpos = size+*newpos;
3161 if (*newpos<0 || *newpos>size) {
3162 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3163 Py_DECREF(restuple);
3164 return NULL;
3165 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003166 Py_INCREF(resunicode);
3167 Py_DECREF(restuple);
3168 return resunicode;
3169}
3170
3171/* Lookup the character ch in the mapping and put the result in result,
3172 which must be decrefed by the caller.
3173 Return 0 on success, -1 on error */
3174static
3175int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3176{
3177 PyObject *w = PyInt_FromLong((long)c);
3178 PyObject *x;
3179
3180 if (w == NULL)
3181 return -1;
3182 x = PyObject_GetItem(mapping, w);
3183 Py_DECREF(w);
3184 if (x == NULL) {
3185 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3186 /* No mapping found means: use 1:1 mapping. */
3187 PyErr_Clear();
3188 *result = NULL;
3189 return 0;
3190 } else
3191 return -1;
3192 }
3193 else if (x == Py_None) {
3194 *result = x;
3195 return 0;
3196 }
3197 else if (PyInt_Check(x)) {
3198 long value = PyInt_AS_LONG(x);
3199 long max = PyUnicode_GetMax();
3200 if (value < 0 || value > max) {
3201 PyErr_Format(PyExc_TypeError,
3202 "character mapping must be in range(0x%lx)", max+1);
3203 Py_DECREF(x);
3204 return -1;
3205 }
3206 *result = x;
3207 return 0;
3208 }
3209 else if (PyUnicode_Check(x)) {
3210 *result = x;
3211 return 0;
3212 }
3213 else {
3214 /* wrong return value */
3215 PyErr_SetString(PyExc_TypeError,
3216 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003217 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 return -1;
3219 }
3220}
3221/* ensure that *outobj is at least requiredsize characters long,
3222if not reallocate and adjust various state variables.
3223Return 0 on success, -1 on error */
3224static
3225int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3226 int requiredsize)
3227{
3228 if (requiredsize > *outsize) {
3229 /* remember old output position */
3230 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3231 /* exponentially overallocate to minimize reallocations */
3232 if (requiredsize < 2 * *outsize)
3233 requiredsize = 2 * *outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003234 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003235 return -1;
3236 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3237 *outsize = requiredsize;
3238 }
3239 return 0;
3240}
3241/* lookup the character, put the result in the output string and adjust
3242 various state variables. Return a new reference to the object that
3243 was put in the output buffer in *result, or Py_None, if the mapping was
3244 undefined (in which case no character was written).
3245 The called must decref result.
3246 Return 0 on success, -1 on error. */
3247static
3248int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3249 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3250{
3251 if (charmaptranslate_lookup(c, mapping, res))
3252 return -1;
3253 if (*res==NULL) {
3254 /* not found => default to 1:1 mapping */
3255 *(*outp)++ = (Py_UNICODE)c;
3256 }
3257 else if (*res==Py_None)
3258 ;
3259 else if (PyInt_Check(*res)) {
3260 /* no overflow check, because we know that the space is enough */
3261 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3262 }
3263 else if (PyUnicode_Check(*res)) {
3264 int repsize = PyUnicode_GET_SIZE(*res);
3265 if (repsize==1) {
3266 /* no overflow check, because we know that the space is enough */
3267 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3268 }
3269 else if (repsize!=0) {
3270 /* more than one character */
3271 int requiredsize = *outsize + repsize - 1;
3272 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3273 return -1;
3274 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3275 *outp += repsize;
3276 }
3277 }
3278 else
3279 return -1;
3280 return 0;
3281}
3282
3283PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284 int size,
3285 PyObject *mapping,
3286 const char *errors)
3287{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 /* output object */
3289 PyObject *res = NULL;
3290 /* pointers to the beginning and end+1 of input */
3291 const Py_UNICODE *startp = p;
3292 const Py_UNICODE *endp = p + size;
3293 /* pointer into the output */
3294 Py_UNICODE *str;
3295 /* current output position */
3296 int respos = 0;
3297 int ressize;
3298 char *reason = "character maps to <undefined>";
3299 PyObject *errorHandler = NULL;
3300 PyObject *exc = NULL;
3301 /* the following variable is used for caching string comparisons
3302 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3303 * 3=ignore, 4=xmlcharrefreplace */
3304 int known_errorHandler = -1;
3305
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 if (mapping == NULL) {
3307 PyErr_BadArgument();
3308 return NULL;
3309 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003310
3311 /* allocate enough for a simple 1:1 translation without
3312 replacements, if we need more, we'll resize */
3313 res = PyUnicode_FromUnicode(NULL, size);
3314 if (res == NULL)
3315 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317 return res;
3318 str = PyUnicode_AS_UNICODE(res);
3319 ressize = size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321 while (p<endp) {
3322 /* try to encode it */
3323 PyObject *x = NULL;
3324 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3325 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 goto onError;
3327 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003328 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003329 if (x!=Py_None) /* it worked => adjust input pointer */
3330 ++p;
3331 else { /* untranslatable character */
3332 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3333 int repsize;
3334 int newpos;
3335 Py_UNICODE *uni2;
3336 /* startpos for collecting untranslatable chars */
3337 const Py_UNICODE *collstart = p;
3338 const Py_UNICODE *collend = p+1;
3339 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 /* find all untranslatable characters */
3342 while (collend < endp) {
3343 if (charmaptranslate_lookup(*collend, mapping, &x))
3344 goto onError;
3345 Py_XDECREF(x);
3346 if (x!=Py_None)
3347 break;
3348 ++collend;
3349 }
3350 /* cache callback name lookup
3351 * (if not done yet, i.e. it's the first error) */
3352 if (known_errorHandler==-1) {
3353 if ((errors==NULL) || (!strcmp(errors, "strict")))
3354 known_errorHandler = 1;
3355 else if (!strcmp(errors, "replace"))
3356 known_errorHandler = 2;
3357 else if (!strcmp(errors, "ignore"))
3358 known_errorHandler = 3;
3359 else if (!strcmp(errors, "xmlcharrefreplace"))
3360 known_errorHandler = 4;
3361 else
3362 known_errorHandler = 0;
3363 }
3364 switch (known_errorHandler) {
3365 case 1: /* strict */
3366 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3367 goto onError;
3368 case 2: /* replace */
3369 /* No need to check for space, this is a 1:1 replacement */
3370 for (coll = collstart; coll<collend; ++coll)
3371 *str++ = '?';
3372 /* fall through */
3373 case 3: /* ignore */
3374 p = collend;
3375 break;
3376 case 4: /* xmlcharrefreplace */
3377 /* generate replacement (temporarily (mis)uses p) */
3378 for (p = collstart; p < collend; ++p) {
3379 char buffer[2+29+1+1];
3380 char *cp;
3381 sprintf(buffer, "&#%d;", (int)*p);
3382 if (charmaptranslate_makespace(&res, &str, &ressize,
3383 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3384 goto onError;
3385 for (cp = buffer; *cp; ++cp)
3386 *str++ = *cp;
3387 }
3388 p = collend;
3389 break;
3390 default:
3391 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3392 reason, startp, size, &exc,
3393 collstart-startp, collend-startp, &newpos);
3394 if (repunicode == NULL)
3395 goto onError;
3396 /* generate replacement */
3397 repsize = PyUnicode_GET_SIZE(repunicode);
3398 if (charmaptranslate_makespace(&res, &str, &ressize,
3399 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3400 Py_DECREF(repunicode);
3401 goto onError;
3402 }
3403 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3404 *str++ = *uni2;
3405 p = startp + newpos;
3406 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 }
3408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003410 /* Resize if we allocated to much */
3411 respos = str-PyUnicode_AS_UNICODE(res);
3412 if (respos<ressize) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003413 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003414 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 }
3416 Py_XDECREF(exc);
3417 Py_XDECREF(errorHandler);
3418 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003420 onError:
3421 Py_XDECREF(res);
3422 Py_XDECREF(exc);
3423 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003424 return NULL;
3425}
3426
3427PyObject *PyUnicode_Translate(PyObject *str,
3428 PyObject *mapping,
3429 const char *errors)
3430{
3431 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003432
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 str = PyUnicode_FromObject(str);
3434 if (str == NULL)
3435 goto onError;
3436 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3437 PyUnicode_GET_SIZE(str),
3438 mapping,
3439 errors);
3440 Py_DECREF(str);
3441 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003442
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443 onError:
3444 Py_XDECREF(str);
3445 return NULL;
3446}
Tim Petersced69f82003-09-16 20:30:58 +00003447
Guido van Rossum9e896b32000-04-05 20:11:21 +00003448/* --- Decimal Encoder ---------------------------------------------------- */
3449
3450int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3451 int length,
3452 char *output,
3453 const char *errors)
3454{
3455 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456 PyObject *errorHandler = NULL;
3457 PyObject *exc = NULL;
3458 const char *encoding = "decimal";
3459 const char *reason = "invalid decimal Unicode string";
3460 /* the following variable is used for caching string comparisons
3461 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3462 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003463
3464 if (output == NULL) {
3465 PyErr_BadArgument();
3466 return -1;
3467 }
3468
3469 p = s;
3470 end = s + length;
3471 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003473 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003474 PyObject *repunicode;
3475 int repsize;
3476 int newpos;
3477 Py_UNICODE *uni2;
3478 Py_UNICODE *collstart;
3479 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003480
Guido van Rossum9e896b32000-04-05 20:11:21 +00003481 if (Py_UNICODE_ISSPACE(ch)) {
3482 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003483 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003484 continue;
3485 }
3486 decimal = Py_UNICODE_TODECIMAL(ch);
3487 if (decimal >= 0) {
3488 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003490 continue;
3491 }
Guido van Rossumba477042000-04-06 18:18:10 +00003492 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003493 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003494 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003495 continue;
3496 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 /* All other characters are considered unencodable */
3498 collstart = p;
3499 collend = p+1;
3500 while (collend < end) {
3501 if ((0 < *collend && *collend < 256) ||
3502 !Py_UNICODE_ISSPACE(*collend) ||
3503 Py_UNICODE_TODECIMAL(*collend))
3504 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003505 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 /* cache callback name lookup
3507 * (if not done yet, i.e. it's the first error) */
3508 if (known_errorHandler==-1) {
3509 if ((errors==NULL) || (!strcmp(errors, "strict")))
3510 known_errorHandler = 1;
3511 else if (!strcmp(errors, "replace"))
3512 known_errorHandler = 2;
3513 else if (!strcmp(errors, "ignore"))
3514 known_errorHandler = 3;
3515 else if (!strcmp(errors, "xmlcharrefreplace"))
3516 known_errorHandler = 4;
3517 else
3518 known_errorHandler = 0;
3519 }
3520 switch (known_errorHandler) {
3521 case 1: /* strict */
3522 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3523 goto onError;
3524 case 2: /* replace */
3525 for (p = collstart; p < collend; ++p)
3526 *output++ = '?';
3527 /* fall through */
3528 case 3: /* ignore */
3529 p = collend;
3530 break;
3531 case 4: /* xmlcharrefreplace */
3532 /* generate replacement (temporarily (mis)uses p) */
3533 for (p = collstart; p < collend; ++p)
3534 output += sprintf(output, "&#%d;", (int)*p);
3535 p = collend;
3536 break;
3537 default:
3538 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3539 encoding, reason, s, length, &exc,
3540 collstart-s, collend-s, &newpos);
3541 if (repunicode == NULL)
3542 goto onError;
3543 /* generate replacement */
3544 repsize = PyUnicode_GET_SIZE(repunicode);
3545 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3546 Py_UNICODE ch = *uni2;
3547 if (Py_UNICODE_ISSPACE(ch))
3548 *output++ = ' ';
3549 else {
3550 decimal = Py_UNICODE_TODECIMAL(ch);
3551 if (decimal >= 0)
3552 *output++ = '0' + decimal;
3553 else if (0 < ch && ch < 256)
3554 *output++ = (char)ch;
3555 else {
3556 Py_DECREF(repunicode);
3557 raise_encode_exception(&exc, encoding,
3558 s, length, collstart-s, collend-s, reason);
3559 goto onError;
3560 }
3561 }
3562 }
3563 p = s + newpos;
3564 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003565 }
3566 }
3567 /* 0-terminate the output string */
3568 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 Py_XDECREF(exc);
3570 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003571 return 0;
3572
3573 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 Py_XDECREF(exc);
3575 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003576 return -1;
3577}
3578
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579/* --- Helpers ------------------------------------------------------------ */
3580
Tim Petersced69f82003-09-16 20:30:58 +00003581static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582int count(PyUnicodeObject *self,
3583 int start,
3584 int end,
3585 PyUnicodeObject *substring)
3586{
3587 int count = 0;
3588
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003589 if (start < 0)
3590 start += self->length;
3591 if (start < 0)
3592 start = 0;
3593 if (end > self->length)
3594 end = self->length;
3595 if (end < 0)
3596 end += self->length;
3597 if (end < 0)
3598 end = 0;
3599
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003600 if (substring->length == 0)
3601 return (end - start + 1);
3602
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603 end -= substring->length;
3604
3605 while (start <= end)
3606 if (Py_UNICODE_MATCH(self, start, substring)) {
3607 count++;
3608 start += substring->length;
3609 } else
3610 start++;
3611
3612 return count;
3613}
3614
3615int PyUnicode_Count(PyObject *str,
3616 PyObject *substr,
3617 int start,
3618 int end)
3619{
3620 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003621
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 str = PyUnicode_FromObject(str);
3623 if (str == NULL)
3624 return -1;
3625 substr = PyUnicode_FromObject(substr);
3626 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003627 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 return -1;
3629 }
Tim Petersced69f82003-09-16 20:30:58 +00003630
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631 result = count((PyUnicodeObject *)str,
3632 start, end,
3633 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003634
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635 Py_DECREF(str);
3636 Py_DECREF(substr);
3637 return result;
3638}
3639
Tim Petersced69f82003-09-16 20:30:58 +00003640static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641int findstring(PyUnicodeObject *self,
3642 PyUnicodeObject *substring,
3643 int start,
3644 int end,
3645 int direction)
3646{
3647 if (start < 0)
3648 start += self->length;
3649 if (start < 0)
3650 start = 0;
3651
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 if (end > self->length)
3653 end = self->length;
3654 if (end < 0)
3655 end += self->length;
3656 if (end < 0)
3657 end = 0;
3658
Guido van Rossum76afbd92002-08-20 17:29:29 +00003659 if (substring->length == 0)
3660 return (direction > 0) ? start : end;
3661
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 end -= substring->length;
3663
3664 if (direction < 0) {
3665 for (; end >= start; end--)
3666 if (Py_UNICODE_MATCH(self, end, substring))
3667 return end;
3668 } else {
3669 for (; start <= end; start++)
3670 if (Py_UNICODE_MATCH(self, start, substring))
3671 return start;
3672 }
3673
3674 return -1;
3675}
3676
3677int PyUnicode_Find(PyObject *str,
3678 PyObject *substr,
3679 int start,
3680 int end,
3681 int direction)
3682{
3683 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003684
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 str = PyUnicode_FromObject(str);
3686 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003687 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 substr = PyUnicode_FromObject(substr);
3689 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003690 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003691 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 }
Tim Petersced69f82003-09-16 20:30:58 +00003693
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 result = findstring((PyUnicodeObject *)str,
3695 (PyUnicodeObject *)substr,
3696 start, end, direction);
3697 Py_DECREF(str);
3698 Py_DECREF(substr);
3699 return result;
3700}
3701
Tim Petersced69f82003-09-16 20:30:58 +00003702static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703int tailmatch(PyUnicodeObject *self,
3704 PyUnicodeObject *substring,
3705 int start,
3706 int end,
3707 int direction)
3708{
3709 if (start < 0)
3710 start += self->length;
3711 if (start < 0)
3712 start = 0;
3713
3714 if (substring->length == 0)
3715 return 1;
3716
3717 if (end > self->length)
3718 end = self->length;
3719 if (end < 0)
3720 end += self->length;
3721 if (end < 0)
3722 end = 0;
3723
3724 end -= substring->length;
3725 if (end < start)
3726 return 0;
3727
3728 if (direction > 0) {
3729 if (Py_UNICODE_MATCH(self, end, substring))
3730 return 1;
3731 } else {
3732 if (Py_UNICODE_MATCH(self, start, substring))
3733 return 1;
3734 }
3735
3736 return 0;
3737}
3738
3739int PyUnicode_Tailmatch(PyObject *str,
3740 PyObject *substr,
3741 int start,
3742 int end,
3743 int direction)
3744{
3745 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003746
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 str = PyUnicode_FromObject(str);
3748 if (str == NULL)
3749 return -1;
3750 substr = PyUnicode_FromObject(substr);
3751 if (substr == NULL) {
3752 Py_DECREF(substr);
3753 return -1;
3754 }
Tim Petersced69f82003-09-16 20:30:58 +00003755
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756 result = tailmatch((PyUnicodeObject *)str,
3757 (PyUnicodeObject *)substr,
3758 start, end, direction);
3759 Py_DECREF(str);
3760 Py_DECREF(substr);
3761 return result;
3762}
3763
Tim Petersced69f82003-09-16 20:30:58 +00003764static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765const Py_UNICODE *findchar(const Py_UNICODE *s,
3766 int size,
3767 Py_UNICODE ch)
3768{
3769 /* like wcschr, but doesn't stop at NULL characters */
3770
3771 while (size-- > 0) {
3772 if (*s == ch)
3773 return s;
3774 s++;
3775 }
3776
3777 return NULL;
3778}
3779
3780/* Apply fixfct filter to the Unicode object self and return a
3781 reference to the modified object */
3782
Tim Petersced69f82003-09-16 20:30:58 +00003783static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784PyObject *fixup(PyUnicodeObject *self,
3785 int (*fixfct)(PyUnicodeObject *s))
3786{
3787
3788 PyUnicodeObject *u;
3789
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003790 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791 if (u == NULL)
3792 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003793
3794 Py_UNICODE_COPY(u->str, self->str, self->length);
3795
Tim Peters7a29bd52001-09-12 03:03:31 +00003796 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 /* fixfct should return TRUE if it modified the buffer. If
3798 FALSE, return a reference to the original buffer instead
3799 (to save space, not time) */
3800 Py_INCREF(self);
3801 Py_DECREF(u);
3802 return (PyObject*) self;
3803 }
3804 return (PyObject*) u;
3805}
3806
Tim Petersced69f82003-09-16 20:30:58 +00003807static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808int fixupper(PyUnicodeObject *self)
3809{
3810 int len = self->length;
3811 Py_UNICODE *s = self->str;
3812 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003813
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 while (len-- > 0) {
3815 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003816
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 ch = Py_UNICODE_TOUPPER(*s);
3818 if (ch != *s) {
3819 status = 1;
3820 *s = ch;
3821 }
3822 s++;
3823 }
3824
3825 return status;
3826}
3827
Tim Petersced69f82003-09-16 20:30:58 +00003828static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829int fixlower(PyUnicodeObject *self)
3830{
3831 int len = self->length;
3832 Py_UNICODE *s = self->str;
3833 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003834
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 while (len-- > 0) {
3836 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003837
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 ch = Py_UNICODE_TOLOWER(*s);
3839 if (ch != *s) {
3840 status = 1;
3841 *s = ch;
3842 }
3843 s++;
3844 }
3845
3846 return status;
3847}
3848
Tim Petersced69f82003-09-16 20:30:58 +00003849static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850int fixswapcase(PyUnicodeObject *self)
3851{
3852 int len = self->length;
3853 Py_UNICODE *s = self->str;
3854 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003855
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 while (len-- > 0) {
3857 if (Py_UNICODE_ISUPPER(*s)) {
3858 *s = Py_UNICODE_TOLOWER(*s);
3859 status = 1;
3860 } else if (Py_UNICODE_ISLOWER(*s)) {
3861 *s = Py_UNICODE_TOUPPER(*s);
3862 status = 1;
3863 }
3864 s++;
3865 }
3866
3867 return status;
3868}
3869
Tim Petersced69f82003-09-16 20:30:58 +00003870static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871int fixcapitalize(PyUnicodeObject *self)
3872{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003873 int len = self->length;
3874 Py_UNICODE *s = self->str;
3875 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003876
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003877 if (len == 0)
3878 return 0;
3879 if (Py_UNICODE_ISLOWER(*s)) {
3880 *s = Py_UNICODE_TOUPPER(*s);
3881 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003882 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003883 s++;
3884 while (--len > 0) {
3885 if (Py_UNICODE_ISUPPER(*s)) {
3886 *s = Py_UNICODE_TOLOWER(*s);
3887 status = 1;
3888 }
3889 s++;
3890 }
3891 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892}
3893
3894static
3895int fixtitle(PyUnicodeObject *self)
3896{
3897 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3898 register Py_UNICODE *e;
3899 int previous_is_cased;
3900
3901 /* Shortcut for single character strings */
3902 if (PyUnicode_GET_SIZE(self) == 1) {
3903 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3904 if (*p != ch) {
3905 *p = ch;
3906 return 1;
3907 }
3908 else
3909 return 0;
3910 }
Tim Petersced69f82003-09-16 20:30:58 +00003911
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912 e = p + PyUnicode_GET_SIZE(self);
3913 previous_is_cased = 0;
3914 for (; p < e; p++) {
3915 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003916
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 if (previous_is_cased)
3918 *p = Py_UNICODE_TOLOWER(ch);
3919 else
3920 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003921
3922 if (Py_UNICODE_ISLOWER(ch) ||
3923 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924 Py_UNICODE_ISTITLE(ch))
3925 previous_is_cased = 1;
3926 else
3927 previous_is_cased = 0;
3928 }
3929 return 1;
3930}
3931
3932PyObject *PyUnicode_Join(PyObject *separator,
3933 PyObject *seq)
3934{
3935 Py_UNICODE *sep;
3936 int seplen;
3937 PyUnicodeObject *res = NULL;
3938 int reslen = 0;
3939 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 int sz = 100;
3941 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003942 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943
Tim Peters2cfe3682001-05-05 05:36:48 +00003944 it = PyObject_GetIter(seq);
3945 if (it == NULL)
3946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947
3948 if (separator == NULL) {
3949 Py_UNICODE blank = ' ';
3950 sep = &blank;
3951 seplen = 1;
3952 }
3953 else {
3954 separator = PyUnicode_FromObject(separator);
3955 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003956 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 sep = PyUnicode_AS_UNICODE(separator);
3958 seplen = PyUnicode_GET_SIZE(separator);
3959 }
Tim Petersced69f82003-09-16 20:30:58 +00003960
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 res = _PyUnicode_New(sz);
3962 if (res == NULL)
3963 goto onError;
3964 p = PyUnicode_AS_UNICODE(res);
3965 reslen = 0;
3966
Tim Peters2cfe3682001-05-05 05:36:48 +00003967 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003969 PyObject *item = PyIter_Next(it);
3970 if (item == NULL) {
3971 if (PyErr_Occurred())
3972 goto onError;
3973 break;
3974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 if (!PyUnicode_Check(item)) {
3976 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003977 if (!PyString_Check(item)) {
3978 PyErr_Format(PyExc_TypeError,
3979 "sequence item %i: expected string or Unicode,"
3980 " %.80s found",
3981 i, item->ob_type->tp_name);
3982 Py_DECREF(item);
3983 goto onError;
3984 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 v = PyUnicode_FromObject(item);
3986 Py_DECREF(item);
3987 item = v;
3988 if (item == NULL)
3989 goto onError;
3990 }
3991 itemlen = PyUnicode_GET_SIZE(item);
3992 while (reslen + itemlen + seplen >= sz) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003993 if (_PyUnicode_Resize(&res, sz*2) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003994 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 sz *= 2;
3998 p = PyUnicode_AS_UNICODE(res) + reslen;
3999 }
4000 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004001 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002 p += seplen;
4003 reslen += seplen;
4004 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004005 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 p += itemlen;
4007 reslen += itemlen;
4008 Py_DECREF(item);
4009 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004010 if (_PyUnicode_Resize(&res, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 goto onError;
4012
4013 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004014 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 return (PyObject *)res;
4016
4017 onError:
4018 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004019 Py_XDECREF(res);
4020 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 return NULL;
4022}
4023
Tim Petersced69f82003-09-16 20:30:58 +00004024static
4025PyUnicodeObject *pad(PyUnicodeObject *self,
4026 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 int right,
4028 Py_UNICODE fill)
4029{
4030 PyUnicodeObject *u;
4031
4032 if (left < 0)
4033 left = 0;
4034 if (right < 0)
4035 right = 0;
4036
Tim Peters7a29bd52001-09-12 03:03:31 +00004037 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 Py_INCREF(self);
4039 return self;
4040 }
4041
4042 u = _PyUnicode_New(left + self->length + right);
4043 if (u) {
4044 if (left)
4045 Py_UNICODE_FILL(u->str, fill, left);
4046 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4047 if (right)
4048 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4049 }
4050
4051 return u;
4052}
4053
/* Append the characters data[left:right] as a new unicode object to the
   list being built.  The macro relies on its expansion context: it uses
   the local variables `str` (scratch PyObject *) and `list`, and jumps to
   the label `onError` if the object cannot be created or appended.
   It expands to a single statement; write it with a trailing semicolon.
   `left` is evaluated twice. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
4064
4065static
4066PyObject *split_whitespace(PyUnicodeObject *self,
4067 PyObject *list,
4068 int maxcount)
4069{
4070 register int i;
4071 register int j;
4072 int len = self->length;
4073 PyObject *str;
4074
4075 for (i = j = 0; i < len; ) {
4076 /* find a token */
4077 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4078 i++;
4079 j = i;
4080 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4081 i++;
4082 if (j < i) {
4083 if (maxcount-- <= 0)
4084 break;
4085 SPLIT_APPEND(self->str, j, i);
4086 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4087 i++;
4088 j = i;
4089 }
4090 }
4091 if (j < len) {
4092 SPLIT_APPEND(self->str, j, len);
4093 }
4094 return list;
4095
4096 onError:
4097 Py_DECREF(list);
4098 return NULL;
4099}
4100
4101PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004102 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103{
4104 register int i;
4105 register int j;
4106 int len;
4107 PyObject *list;
4108 PyObject *str;
4109 Py_UNICODE *data;
4110
4111 string = PyUnicode_FromObject(string);
4112 if (string == NULL)
4113 return NULL;
4114 data = PyUnicode_AS_UNICODE(string);
4115 len = PyUnicode_GET_SIZE(string);
4116
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 list = PyList_New(0);
4118 if (!list)
4119 goto onError;
4120
4121 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004122 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004123
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 /* Find a line and append it */
4125 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4126 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127
4128 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004129 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 if (i < len) {
4131 if (data[i] == '\r' && i + 1 < len &&
4132 data[i+1] == '\n')
4133 i += 2;
4134 else
4135 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004136 if (keepends)
4137 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 }
Guido van Rossum86662912000-04-11 15:38:46 +00004139 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 j = i;
4141 }
4142 if (j < len) {
4143 SPLIT_APPEND(data, j, len);
4144 }
4145
4146 Py_DECREF(string);
4147 return list;
4148
4149 onError:
4150 Py_DECREF(list);
4151 Py_DECREF(string);
4152 return NULL;
4153}
4154
Tim Petersced69f82003-09-16 20:30:58 +00004155static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156PyObject *split_char(PyUnicodeObject *self,
4157 PyObject *list,
4158 Py_UNICODE ch,
4159 int maxcount)
4160{
4161 register int i;
4162 register int j;
4163 int len = self->length;
4164 PyObject *str;
4165
4166 for (i = j = 0; i < len; ) {
4167 if (self->str[i] == ch) {
4168 if (maxcount-- <= 0)
4169 break;
4170 SPLIT_APPEND(self->str, j, i);
4171 i = j = i + 1;
4172 } else
4173 i++;
4174 }
4175 if (j <= len) {
4176 SPLIT_APPEND(self->str, j, len);
4177 }
4178 return list;
4179
4180 onError:
4181 Py_DECREF(list);
4182 return NULL;
4183}
4184
Tim Petersced69f82003-09-16 20:30:58 +00004185static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186PyObject *split_substring(PyUnicodeObject *self,
4187 PyObject *list,
4188 PyUnicodeObject *substring,
4189 int maxcount)
4190{
4191 register int i;
4192 register int j;
4193 int len = self->length;
4194 int sublen = substring->length;
4195 PyObject *str;
4196
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004197 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198 if (Py_UNICODE_MATCH(self, i, substring)) {
4199 if (maxcount-- <= 0)
4200 break;
4201 SPLIT_APPEND(self->str, j, i);
4202 i = j = i + sublen;
4203 } else
4204 i++;
4205 }
4206 if (j <= len) {
4207 SPLIT_APPEND(self->str, j, len);
4208 }
4209 return list;
4210
4211 onError:
4212 Py_DECREF(list);
4213 return NULL;
4214}
4215
4216#undef SPLIT_APPEND
4217
4218static
4219PyObject *split(PyUnicodeObject *self,
4220 PyUnicodeObject *substring,
4221 int maxcount)
4222{
4223 PyObject *list;
4224
4225 if (maxcount < 0)
4226 maxcount = INT_MAX;
4227
4228 list = PyList_New(0);
4229 if (!list)
4230 return NULL;
4231
4232 if (substring == NULL)
4233 return split_whitespace(self,list,maxcount);
4234
4235 else if (substring->length == 1)
4236 return split_char(self,list,substring->str[0],maxcount);
4237
4238 else if (substring->length == 0) {
4239 Py_DECREF(list);
4240 PyErr_SetString(PyExc_ValueError, "empty separator");
4241 return NULL;
4242 }
4243 else
4244 return split_substring(self,list,substring,maxcount);
4245}
4246
Tim Petersced69f82003-09-16 20:30:58 +00004247static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004248PyObject *replace(PyUnicodeObject *self,
4249 PyUnicodeObject *str1,
4250 PyUnicodeObject *str2,
4251 int maxcount)
4252{
4253 PyUnicodeObject *u;
4254
4255 if (maxcount < 0)
4256 maxcount = INT_MAX;
4257
4258 if (str1->length == 1 && str2->length == 1) {
4259 int i;
4260
4261 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004262 if (!findchar(self->str, self->length, str1->str[0]) &&
4263 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004264 /* nothing to replace, return original string */
4265 Py_INCREF(self);
4266 u = self;
4267 } else {
4268 Py_UNICODE u1 = str1->str[0];
4269 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004270
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004272 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273 self->length
4274 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004275 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004276 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004277 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278 for (i = 0; i < u->length; i++)
4279 if (u->str[i] == u1) {
4280 if (--maxcount < 0)
4281 break;
4282 u->str[i] = u2;
4283 }
4284 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286
4287 } else {
4288 int n, i;
4289 Py_UNICODE *p;
4290
4291 /* replace strings */
4292 n = count(self, 0, self->length, str1);
4293 if (n > maxcount)
4294 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004295 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004296 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004297 if (PyUnicode_CheckExact(self)) {
4298 Py_INCREF(self);
4299 u = self;
4300 }
4301 else {
4302 u = (PyUnicodeObject *)
4303 PyUnicode_FromUnicode(self->str, self->length);
4304 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 } else {
4306 u = _PyUnicode_New(
4307 self->length + n * (str2->length - str1->length));
4308 if (u) {
4309 i = 0;
4310 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004311 if (str1->length > 0) {
4312 while (i <= self->length - str1->length)
4313 if (Py_UNICODE_MATCH(self, i, str1)) {
4314 /* replace string segment */
4315 Py_UNICODE_COPY(p, str2->str, str2->length);
4316 p += str2->length;
4317 i += str1->length;
4318 if (--n <= 0) {
4319 /* copy remaining part */
4320 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4321 break;
4322 }
4323 } else
4324 *p++ = self->str[i++];
4325 } else {
4326 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327 Py_UNICODE_COPY(p, str2->str, str2->length);
4328 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004329 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004332 }
4333 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4334 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335 }
4336 }
4337 }
Tim Petersced69f82003-09-16 20:30:58 +00004338
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339 return (PyObject *) u;
4340}
4341
4342/* --- Unicode Object Methods --------------------------------------------- */
4343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004344PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345"S.title() -> unicode\n\
4346\n\
4347Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004348characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349
4350static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004351unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353 return fixup(self, fixtitle);
4354}
4355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004356PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357"S.capitalize() -> unicode\n\
4358\n\
4359Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004360have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361
4362static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004363unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 return fixup(self, fixcapitalize);
4366}
4367
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out by the surrounding #if 0.

   Splits self with split(self, NULL, -1), replaces every list entry by
   the result of fixup(entry, fixcapitalize) and joins the entries with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        /* drop the old entry before storing the new one in its slot */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
4405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004406PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407"S.center(width) -> unicode\n\
4408\n\
4409Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004410using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411
4412static PyObject *
4413unicode_center(PyUnicodeObject *self, PyObject *args)
4414{
4415 int marg, left;
4416 int width;
4417
4418 if (!PyArg_ParseTuple(args, "i:center", &width))
4419 return NULL;
4420
Tim Peters7a29bd52001-09-12 03:03:31 +00004421 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 Py_INCREF(self);
4423 return (PyObject*) self;
4424 }
4425
4426 marg = width - self->length;
4427 left = marg / 2 + (marg & width & 1);
4428
4429 return (PyObject*) pad(self, left, marg - left, ' ');
4430}
4431
Marc-André Lemburge5034372000-08-08 08:04:29 +00004432#if 0
4433
4434/* This code should go into some future Unicode collation support
4435 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004436 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004437
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004438/* speedy UTF-16 code point order comparison */
4439/* gleaned from: */
4440/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4441
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004442static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004443{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004444 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004445 0, 0, 0, 0, 0, 0, 0, 0,
4446 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004447 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004448};
4449
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450static int
4451unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4452{
4453 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004454
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 Py_UNICODE *s1 = str1->str;
4456 Py_UNICODE *s2 = str2->str;
4457
4458 len1 = str1->length;
4459 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004460
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004462 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004463
4464 c1 = *s1++;
4465 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004466
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004467 if (c1 > (1<<11) * 26)
4468 c1 += utf16Fixup[c1>>11];
4469 if (c2 > (1<<11) * 26)
4470 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004471 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004472
4473 if (c1 != c2)
4474 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004475
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004476 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 }
4478
4479 return (len1 < len2) ? -1 : (len1 != len2);
4480}
4481
Marc-André Lemburge5034372000-08-08 08:04:29 +00004482#else
4483
4484static int
4485unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4486{
4487 register int len1, len2;
4488
4489 Py_UNICODE *s1 = str1->str;
4490 Py_UNICODE *s2 = str2->str;
4491
4492 len1 = str1->length;
4493 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004494
Marc-André Lemburge5034372000-08-08 08:04:29 +00004495 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004496 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004497
Fredrik Lundh45714e92001-06-26 16:39:36 +00004498 c1 = *s1++;
4499 c2 = *s2++;
4500
4501 if (c1 != c2)
4502 return (c1 < c2) ? -1 : 1;
4503
Marc-André Lemburge5034372000-08-08 08:04:29 +00004504 len1--; len2--;
4505 }
4506
4507 return (len1 < len2) ? -1 : (len1 != len2);
4508}
4509
4510#endif
4511
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512int PyUnicode_Compare(PyObject *left,
4513 PyObject *right)
4514{
4515 PyUnicodeObject *u = NULL, *v = NULL;
4516 int result;
4517
4518 /* Coerce the two arguments */
4519 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4520 if (u == NULL)
4521 goto onError;
4522 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4523 if (v == NULL)
4524 goto onError;
4525
Thomas Wouters7e474022000-07-16 12:04:32 +00004526 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527 if (v == u) {
4528 Py_DECREF(u);
4529 Py_DECREF(v);
4530 return 0;
4531 }
4532
4533 result = unicode_compare(u, v);
4534
4535 Py_DECREF(u);
4536 Py_DECREF(v);
4537 return result;
4538
4539onError:
4540 Py_XDECREF(u);
4541 Py_XDECREF(v);
4542 return -1;
4543}
4544
Guido van Rossum403d68b2000-03-13 15:55:09 +00004545int PyUnicode_Contains(PyObject *container,
4546 PyObject *element)
4547{
4548 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004549 int result, size;
4550 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004551
4552 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004553 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004554 if (v == NULL) {
4555 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004556 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004557 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004558 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004559 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004560 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004561 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004562
Barry Warsaw817918c2002-08-06 16:58:21 +00004563 size = PyUnicode_GET_SIZE(v);
4564 rhs = PyUnicode_AS_UNICODE(v);
4565 lhs = PyUnicode_AS_UNICODE(u);
4566
Guido van Rossum403d68b2000-03-13 15:55:09 +00004567 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004568 if (size == 1) {
4569 end = lhs + PyUnicode_GET_SIZE(u);
4570 while (lhs < end) {
4571 if (*lhs++ == *rhs) {
4572 result = 1;
4573 break;
4574 }
4575 }
4576 }
4577 else {
4578 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4579 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004580 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004581 result = 1;
4582 break;
4583 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004584 }
4585 }
4586
4587 Py_DECREF(u);
4588 Py_DECREF(v);
4589 return result;
4590
4591onError:
4592 Py_XDECREF(u);
4593 Py_XDECREF(v);
4594 return -1;
4595}
4596
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597/* Concat to string or Unicode object giving a new Unicode object. */
4598
4599PyObject *PyUnicode_Concat(PyObject *left,
4600 PyObject *right)
4601{
4602 PyUnicodeObject *u = NULL, *v = NULL, *w;
4603
4604 /* Coerce the two arguments */
4605 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4606 if (u == NULL)
4607 goto onError;
4608 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4609 if (v == NULL)
4610 goto onError;
4611
4612 /* Shortcuts */
4613 if (v == unicode_empty) {
4614 Py_DECREF(v);
4615 return (PyObject *)u;
4616 }
4617 if (u == unicode_empty) {
4618 Py_DECREF(u);
4619 return (PyObject *)v;
4620 }
4621
4622 /* Concat the two Unicode strings */
4623 w = _PyUnicode_New(u->length + v->length);
4624 if (w == NULL)
4625 goto onError;
4626 Py_UNICODE_COPY(w->str, u->str, u->length);
4627 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4628
4629 Py_DECREF(u);
4630 Py_DECREF(v);
4631 return (PyObject *)w;
4632
4633onError:
4634 Py_XDECREF(u);
4635 Py_XDECREF(v);
4636 return NULL;
4637}
4638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004639PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640"S.count(sub[, start[, end]]) -> int\n\
4641\n\
4642Return the number of occurrences of substring sub in Unicode string\n\
4643S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004644interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645
4646static PyObject *
4647unicode_count(PyUnicodeObject *self, PyObject *args)
4648{
4649 PyUnicodeObject *substring;
4650 int start = 0;
4651 int end = INT_MAX;
4652 PyObject *result;
4653
Guido van Rossumb8872e62000-05-09 14:14:27 +00004654 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4655 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656 return NULL;
4657
4658 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4659 (PyObject *)substring);
4660 if (substring == NULL)
4661 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004662
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 if (start < 0)
4664 start += self->length;
4665 if (start < 0)
4666 start = 0;
4667 if (end > self->length)
4668 end = self->length;
4669 if (end < 0)
4670 end += self->length;
4671 if (end < 0)
4672 end = 0;
4673
4674 result = PyInt_FromLong((long) count(self, start, end, substring));
4675
4676 Py_DECREF(substring);
4677 return result;
4678}
4679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004680PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681"S.encode([encoding[,errors]]) -> string\n\
4682\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004683Return an encoded string version of S. Default encoding is the current\n\
4684default string encoding. errors may be given to set a different error\n\
4685handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4687'xmlcharrefreplace' as well as any other name registered with\n\
4688codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689
4690static PyObject *
4691unicode_encode(PyUnicodeObject *self, PyObject *args)
4692{
4693 char *encoding = NULL;
4694 char *errors = NULL;
4695 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4696 return NULL;
4697 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4698}
4699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004700PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701"S.expandtabs([tabsize]) -> unicode\n\
4702\n\
4703Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004704If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705
4706static PyObject*
4707unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4708{
4709 Py_UNICODE *e;
4710 Py_UNICODE *p;
4711 Py_UNICODE *q;
4712 int i, j;
4713 PyUnicodeObject *u;
4714 int tabsize = 8;
4715
4716 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4717 return NULL;
4718
Thomas Wouters7e474022000-07-16 12:04:32 +00004719 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720 i = j = 0;
4721 e = self->str + self->length;
4722 for (p = self->str; p < e; p++)
4723 if (*p == '\t') {
4724 if (tabsize > 0)
4725 j += tabsize - (j % tabsize);
4726 }
4727 else {
4728 j++;
4729 if (*p == '\n' || *p == '\r') {
4730 i += j;
4731 j = 0;
4732 }
4733 }
4734
4735 /* Second pass: create output string and fill it */
4736 u = _PyUnicode_New(i + j);
4737 if (!u)
4738 return NULL;
4739
4740 j = 0;
4741 q = u->str;
4742
4743 for (p = self->str; p < e; p++)
4744 if (*p == '\t') {
4745 if (tabsize > 0) {
4746 i = tabsize - (j % tabsize);
4747 j += i;
4748 while (i--)
4749 *q++ = ' ';
4750 }
4751 }
4752 else {
4753 j++;
4754 *q++ = *p;
4755 if (*p == '\n' || *p == '\r')
4756 j = 0;
4757 }
4758
4759 return (PyObject*) u;
4760}
4761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004762PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763"S.find(sub [,start [,end]]) -> int\n\
4764\n\
4765Return the lowest index in S where substring sub is found,\n\
4766such that sub is contained within s[start,end]. Optional\n\
4767arguments start and end are interpreted as in slice notation.\n\
4768\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004769Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770
4771static PyObject *
4772unicode_find(PyUnicodeObject *self, PyObject *args)
4773{
4774 PyUnicodeObject *substring;
4775 int start = 0;
4776 int end = INT_MAX;
4777 PyObject *result;
4778
Guido van Rossumb8872e62000-05-09 14:14:27 +00004779 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4780 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781 return NULL;
4782 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4783 (PyObject *)substring);
4784 if (substring == NULL)
4785 return NULL;
4786
4787 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4788
4789 Py_DECREF(substring);
4790 return result;
4791}
4792
4793static PyObject *
4794unicode_getitem(PyUnicodeObject *self, int index)
4795{
4796 if (index < 0 || index >= self->length) {
4797 PyErr_SetString(PyExc_IndexError, "string index out of range");
4798 return NULL;
4799 }
4800
4801 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4802}
4803
4804static long
4805unicode_hash(PyUnicodeObject *self)
4806{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004807 /* Since Unicode objects compare equal to their ASCII string
4808 counterparts, they should use the individual character values
4809 as basis for their hash value. This is needed to assure that
4810 strings and Unicode objects behave in the same way as
4811 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812
Fredrik Lundhdde61642000-07-10 18:27:47 +00004813 register int len;
4814 register Py_UNICODE *p;
4815 register long x;
4816
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 if (self->hash != -1)
4818 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004819 len = PyUnicode_GET_SIZE(self);
4820 p = PyUnicode_AS_UNICODE(self);
4821 x = *p << 7;
4822 while (--len >= 0)
4823 x = (1000003*x) ^ *p++;
4824 x ^= PyUnicode_GET_SIZE(self);
4825 if (x == -1)
4826 x = -2;
4827 self->hash = x;
4828 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829}
4830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004831PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832"S.index(sub [,start [,end]]) -> int\n\
4833\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004834Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835
4836static PyObject *
4837unicode_index(PyUnicodeObject *self, PyObject *args)
4838{
4839 int result;
4840 PyUnicodeObject *substring;
4841 int start = 0;
4842 int end = INT_MAX;
4843
Guido van Rossumb8872e62000-05-09 14:14:27 +00004844 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4845 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004847
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4849 (PyObject *)substring);
4850 if (substring == NULL)
4851 return NULL;
4852
4853 result = findstring(self, substring, start, end, 1);
4854
4855 Py_DECREF(substring);
4856 if (result < 0) {
4857 PyErr_SetString(PyExc_ValueError, "substring not found");
4858 return NULL;
4859 }
4860 return PyInt_FromLong(result);
4861}
4862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004863PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004864"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004866Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004867at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868
4869static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004870unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871{
4872 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4873 register const Py_UNICODE *e;
4874 int cased;
4875
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 /* Shortcut for single character strings */
4877 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004878 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004880 /* Special case for empty strings */
4881 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004882 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004883
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 e = p + PyUnicode_GET_SIZE(self);
4885 cased = 0;
4886 for (; p < e; p++) {
4887 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004888
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004890 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 else if (!cased && Py_UNICODE_ISLOWER(ch))
4892 cased = 1;
4893 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004894 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895}
4896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004897PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004898"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00004900Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004901at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902
4903static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004904unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905{
4906 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4907 register const Py_UNICODE *e;
4908 int cased;
4909
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 /* Shortcut for single character strings */
4911 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004912 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004914 /* Special case for empty strings */
4915 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004916 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004917
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 e = p + PyUnicode_GET_SIZE(self);
4919 cased = 0;
4920 for (; p < e; p++) {
4921 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004922
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004924 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 else if (!cased && Py_UNICODE_ISUPPER(ch))
4926 cased = 1;
4927 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004928 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929}
4930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004931PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004932"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00004934Return True if S is a titlecased string and there is at least one\n\
4935character in S, i.e. upper- and titlecase characters may only\n\
4936follow uncased characters and lowercase characters only cased ones.\n\
4937Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938
4939static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004940unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941{
4942 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4943 register const Py_UNICODE *e;
4944 int cased, previous_is_cased;
4945
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946 /* Shortcut for single character strings */
4947 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004948 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4949 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004951 /* Special case for empty strings */
4952 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004953 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004954
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955 e = p + PyUnicode_GET_SIZE(self);
4956 cased = 0;
4957 previous_is_cased = 0;
4958 for (; p < e; p++) {
4959 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004960
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4962 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004963 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 previous_is_cased = 1;
4965 cased = 1;
4966 }
4967 else if (Py_UNICODE_ISLOWER(ch)) {
4968 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004969 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970 previous_is_cased = 1;
4971 cased = 1;
4972 }
4973 else
4974 previous_is_cased = 0;
4975 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004976 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977}
4978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004979PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004980"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00004982Return True if all characters in S are whitespace\n\
4983and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984
4985static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004986unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987{
4988 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4989 register const Py_UNICODE *e;
4990
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 /* Shortcut for single character strings */
4992 if (PyUnicode_GET_SIZE(self) == 1 &&
4993 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004994 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004996 /* Special case for empty strings */
4997 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004998 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004999
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 e = p + PyUnicode_GET_SIZE(self);
5001 for (; p < e; p++) {
5002 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005003 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005005 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006}
5007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005008PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005009"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005010\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005011Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005012and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005013
5014static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005015unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005016{
5017 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5018 register const Py_UNICODE *e;
5019
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005020 /* Shortcut for single character strings */
5021 if (PyUnicode_GET_SIZE(self) == 1 &&
5022 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005023 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005024
5025 /* Special case for empty strings */
5026 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005027 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005028
5029 e = p + PyUnicode_GET_SIZE(self);
5030 for (; p < e; p++) {
5031 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005032 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005033 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005034 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005035}
5036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005037PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005038"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005039\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005040Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005041and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005042
5043static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005044unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005045{
5046 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5047 register const Py_UNICODE *e;
5048
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005049 /* Shortcut for single character strings */
5050 if (PyUnicode_GET_SIZE(self) == 1 &&
5051 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005052 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005053
5054 /* Special case for empty strings */
5055 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005056 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005057
5058 e = p + PyUnicode_GET_SIZE(self);
5059 for (; p < e; p++) {
5060 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005061 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005062 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005063 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005064}
5065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005066PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005067"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005069Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005070False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071
5072static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005073unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074{
5075 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5076 register const Py_UNICODE *e;
5077
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 /* Shortcut for single character strings */
5079 if (PyUnicode_GET_SIZE(self) == 1 &&
5080 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005081 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005083 /* Special case for empty strings */
5084 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005085 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005086
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087 e = p + PyUnicode_GET_SIZE(self);
5088 for (; p < e; p++) {
5089 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005090 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005092 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093}
5094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005095PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005096"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005098Return True if all characters in S are digits\n\
5099and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100
5101static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005102unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103{
5104 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5105 register const Py_UNICODE *e;
5106
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 /* Shortcut for single character strings */
5108 if (PyUnicode_GET_SIZE(self) == 1 &&
5109 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005110 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005112 /* Special case for empty strings */
5113 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005114 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005115
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116 e = p + PyUnicode_GET_SIZE(self);
5117 for (; p < e; p++) {
5118 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005119 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005121 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122}
5123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005124PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005125"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005127Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005128False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129
5130static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005131unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132{
5133 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5134 register const Py_UNICODE *e;
5135
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 /* Shortcut for single character strings */
5137 if (PyUnicode_GET_SIZE(self) == 1 &&
5138 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005139 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005141 /* Special case for empty strings */
5142 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005143 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005144
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 e = p + PyUnicode_GET_SIZE(self);
5146 for (; p < e; p++) {
5147 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005148 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005150 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151}
5152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005153PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154"S.join(sequence) -> unicode\n\
5155\n\
5156Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005157sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158
5159static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005160unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005162 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163}
5164
5165static int
5166unicode_length(PyUnicodeObject *self)
5167{
5168 return self->length;
5169}
5170
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005171PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172"S.ljust(width) -> unicode\n\
5173\n\
5174Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005175done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176
5177static PyObject *
5178unicode_ljust(PyUnicodeObject *self, PyObject *args)
5179{
5180 int width;
5181 if (!PyArg_ParseTuple(args, "i:ljust", &width))
5182 return NULL;
5183
Tim Peters7a29bd52001-09-12 03:03:31 +00005184 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 Py_INCREF(self);
5186 return (PyObject*) self;
5187 }
5188
5189 return (PyObject*) pad(self, 0, width - self->length, ' ');
5190}
5191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005192PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193"S.lower() -> unicode\n\
5194\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005195Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196
5197static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005198unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 return fixup(self, fixlower);
5201}
5202
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* PyArg_ParseTuple format strings, one per strip mode above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
5211
5212static const Py_UNICODE *
5213unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5214{
Tim Peters030a5ce2002-04-22 19:00:10 +00005215 size_t i;
5216 for (i = 0; i < n; ++i)
5217 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005218 return s+i;
5219 return NULL;
5220}
5221
5222/* externally visible for str.strip(unicode) */
5223PyObject *
5224_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5225{
5226 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5227 int len = PyUnicode_GET_SIZE(self);
5228 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5229 int seplen = PyUnicode_GET_SIZE(sepobj);
5230 int i, j;
5231
5232 i = 0;
5233 if (striptype != RIGHTSTRIP) {
5234 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5235 i++;
5236 }
5237 }
5238
5239 j = len;
5240 if (striptype != LEFTSTRIP) {
5241 do {
5242 j--;
5243 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5244 j++;
5245 }
5246
5247 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5248 Py_INCREF(self);
5249 return (PyObject*)self;
5250 }
5251 else
5252 return PyUnicode_FromUnicode(s+i, j-i);
5253}
5254
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255
5256static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005257do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005259 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5260 int len = PyUnicode_GET_SIZE(self), i, j;
5261
5262 i = 0;
5263 if (striptype != RIGHTSTRIP) {
5264 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5265 i++;
5266 }
5267 }
5268
5269 j = len;
5270 if (striptype != LEFTSTRIP) {
5271 do {
5272 j--;
5273 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5274 j++;
5275 }
5276
5277 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5278 Py_INCREF(self);
5279 return (PyObject*)self;
5280 }
5281 else
5282 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283}
5284
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005285
5286static PyObject *
5287do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5288{
5289 PyObject *sep = NULL;
5290
5291 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5292 return NULL;
5293
5294 if (sep != NULL && sep != Py_None) {
5295 if (PyUnicode_Check(sep))
5296 return _PyUnicode_XStrip(self, striptype, sep);
5297 else if (PyString_Check(sep)) {
5298 PyObject *res;
5299 sep = PyUnicode_FromObject(sep);
5300 if (sep==NULL)
5301 return NULL;
5302 res = _PyUnicode_XStrip(self, striptype, sep);
5303 Py_DECREF(sep);
5304 return res;
5305 }
5306 else {
5307 PyErr_Format(PyExc_TypeError,
5308 "%s arg must be None, unicode or str",
5309 STRIPNAME(striptype));
5310 return NULL;
5311 }
5312 }
5313
5314 return do_strip(self, striptype);
5315}
5316
5317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005318PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005319"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005320\n\
5321Return a copy of the string S with leading and trailing\n\
5322whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005323If chars is given and not None, remove characters in chars instead.\n\
5324If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005325
5326static PyObject *
5327unicode_strip(PyUnicodeObject *self, PyObject *args)
5328{
5329 if (PyTuple_GET_SIZE(args) == 0)
5330 return do_strip(self, BOTHSTRIP); /* Common case */
5331 else
5332 return do_argstrip(self, BOTHSTRIP, args);
5333}
5334
5335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005336PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005337"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005338\n\
5339Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005340If chars is given and not None, remove characters in chars instead.\n\
5341If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005342
5343static PyObject *
5344unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5345{
5346 if (PyTuple_GET_SIZE(args) == 0)
5347 return do_strip(self, LEFTSTRIP); /* Common case */
5348 else
5349 return do_argstrip(self, LEFTSTRIP, args);
5350}
5351
5352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005353PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005354"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005355\n\
5356Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005357If chars is given and not None, remove characters in chars instead.\n\
5358If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005359
5360static PyObject *
5361unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5362{
5363 if (PyTuple_GET_SIZE(args) == 0)
5364 return do_strip(self, RIGHTSTRIP); /* Common case */
5365 else
5366 return do_argstrip(self, RIGHTSTRIP, args);
5367}
5368
5369
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370static PyObject*
5371unicode_repeat(PyUnicodeObject *str, int len)
5372{
5373 PyUnicodeObject *u;
5374 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005375 int nchars;
5376 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377
5378 if (len < 0)
5379 len = 0;
5380
Tim Peters7a29bd52001-09-12 03:03:31 +00005381 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 /* no repeat, return original string */
5383 Py_INCREF(str);
5384 return (PyObject*) str;
5385 }
Tim Peters8f422462000-09-09 06:13:41 +00005386
5387 /* ensure # of chars needed doesn't overflow int and # of bytes
5388 * needed doesn't overflow size_t
5389 */
5390 nchars = len * str->length;
5391 if (len && nchars / len != str->length) {
5392 PyErr_SetString(PyExc_OverflowError,
5393 "repeated string is too long");
5394 return NULL;
5395 }
5396 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5397 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5398 PyErr_SetString(PyExc_OverflowError,
5399 "repeated string is too long");
5400 return NULL;
5401 }
5402 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 if (!u)
5404 return NULL;
5405
5406 p = u->str;
5407
5408 while (len-- > 0) {
5409 Py_UNICODE_COPY(p, str->str, str->length);
5410 p += str->length;
5411 }
5412
5413 return (PyObject*) u;
5414}
5415
5416PyObject *PyUnicode_Replace(PyObject *obj,
5417 PyObject *subobj,
5418 PyObject *replobj,
5419 int maxcount)
5420{
5421 PyObject *self;
5422 PyObject *str1;
5423 PyObject *str2;
5424 PyObject *result;
5425
5426 self = PyUnicode_FromObject(obj);
5427 if (self == NULL)
5428 return NULL;
5429 str1 = PyUnicode_FromObject(subobj);
5430 if (str1 == NULL) {
5431 Py_DECREF(self);
5432 return NULL;
5433 }
5434 str2 = PyUnicode_FromObject(replobj);
5435 if (str2 == NULL) {
5436 Py_DECREF(self);
5437 Py_DECREF(str1);
5438 return NULL;
5439 }
Tim Petersced69f82003-09-16 20:30:58 +00005440 result = replace((PyUnicodeObject *)self,
5441 (PyUnicodeObject *)str1,
5442 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 maxcount);
5444 Py_DECREF(self);
5445 Py_DECREF(str1);
5446 Py_DECREF(str2);
5447 return result;
5448}
5449
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005450PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451"S.replace (old, new[, maxsplit]) -> unicode\n\
5452\n\
5453Return a copy of S with all occurrences of substring\n\
5454old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005455given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456
5457static PyObject*
5458unicode_replace(PyUnicodeObject *self, PyObject *args)
5459{
5460 PyUnicodeObject *str1;
5461 PyUnicodeObject *str2;
5462 int maxcount = -1;
5463 PyObject *result;
5464
5465 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5466 return NULL;
5467 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5468 if (str1 == NULL)
5469 return NULL;
5470 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005471 if (str2 == NULL) {
5472 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475
5476 result = replace(self, str1, str2, maxcount);
5477
5478 Py_DECREF(str1);
5479 Py_DECREF(str2);
5480 return result;
5481}
5482
5483static
5484PyObject *unicode_repr(PyObject *unicode)
5485{
5486 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5487 PyUnicode_GET_SIZE(unicode),
5488 1);
5489}
5490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005491PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492"S.rfind(sub [,start [,end]]) -> int\n\
5493\n\
5494Return the highest index in S where substring sub is found,\n\
5495such that sub is contained within s[start,end]. Optional\n\
5496arguments start and end are interpreted as in slice notation.\n\
5497\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005498Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499
5500static PyObject *
5501unicode_rfind(PyUnicodeObject *self, PyObject *args)
5502{
5503 PyUnicodeObject *substring;
5504 int start = 0;
5505 int end = INT_MAX;
5506 PyObject *result;
5507
Guido van Rossumb8872e62000-05-09 14:14:27 +00005508 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5509 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 return NULL;
5511 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5512 (PyObject *)substring);
5513 if (substring == NULL)
5514 return NULL;
5515
5516 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5517
5518 Py_DECREF(substring);
5519 return result;
5520}
5521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005522PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523"S.rindex(sub [,start [,end]]) -> int\n\
5524\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005525Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526
5527static PyObject *
5528unicode_rindex(PyUnicodeObject *self, PyObject *args)
5529{
5530 int result;
5531 PyUnicodeObject *substring;
5532 int start = 0;
5533 int end = INT_MAX;
5534
Guido van Rossumb8872e62000-05-09 14:14:27 +00005535 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5536 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 return NULL;
5538 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5539 (PyObject *)substring);
5540 if (substring == NULL)
5541 return NULL;
5542
5543 result = findstring(self, substring, start, end, -1);
5544
5545 Py_DECREF(substring);
5546 if (result < 0) {
5547 PyErr_SetString(PyExc_ValueError, "substring not found");
5548 return NULL;
5549 }
5550 return PyInt_FromLong(result);
5551}
5552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005553PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554"S.rjust(width) -> unicode\n\
5555\n\
5556Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005557done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558
5559static PyObject *
5560unicode_rjust(PyUnicodeObject *self, PyObject *args)
5561{
5562 int width;
5563 if (!PyArg_ParseTuple(args, "i:rjust", &width))
5564 return NULL;
5565
Tim Peters7a29bd52001-09-12 03:03:31 +00005566 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 Py_INCREF(self);
5568 return (PyObject*) self;
5569 }
5570
5571 return (PyObject*) pad(self, width - self->length, 0, ' ');
5572}
5573
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574static PyObject*
5575unicode_slice(PyUnicodeObject *self, int start, int end)
5576{
5577 /* standard clamping */
5578 if (start < 0)
5579 start = 0;
5580 if (end < 0)
5581 end = 0;
5582 if (end > self->length)
5583 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005584 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 /* full slice, return original string */
5586 Py_INCREF(self);
5587 return (PyObject*) self;
5588 }
5589 if (start > end)
5590 start = end;
5591 /* copy slice */
5592 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5593 end - start);
5594}
5595
5596PyObject *PyUnicode_Split(PyObject *s,
5597 PyObject *sep,
5598 int maxsplit)
5599{
5600 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005601
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 s = PyUnicode_FromObject(s);
5603 if (s == NULL)
5604 return NULL;
5605 if (sep != NULL) {
5606 sep = PyUnicode_FromObject(sep);
5607 if (sep == NULL) {
5608 Py_DECREF(s);
5609 return NULL;
5610 }
5611 }
5612
5613 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5614
5615 Py_DECREF(s);
5616 Py_XDECREF(sep);
5617 return result;
5618}
5619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005620PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621"S.split([sep [,maxsplit]]) -> list of strings\n\
5622\n\
5623Return a list of the words in S, using sep as the\n\
5624delimiter string. If maxsplit is given, at most maxsplit\n\
5625splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005626is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627
5628static PyObject*
5629unicode_split(PyUnicodeObject *self, PyObject *args)
5630{
5631 PyObject *substring = Py_None;
5632 int maxcount = -1;
5633
5634 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5635 return NULL;
5636
5637 if (substring == Py_None)
5638 return split(self, NULL, maxcount);
5639 else if (PyUnicode_Check(substring))
5640 return split(self, (PyUnicodeObject *)substring, maxcount);
5641 else
5642 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5643}
5644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005645PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005646"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647\n\
5648Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005649Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005650is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651
5652static PyObject*
5653unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5654{
Guido van Rossum86662912000-04-11 15:38:46 +00005655 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656
Guido van Rossum86662912000-04-11 15:38:46 +00005657 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 return NULL;
5659
Guido van Rossum86662912000-04-11 15:38:46 +00005660 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661}
5662
5663static
5664PyObject *unicode_str(PyUnicodeObject *self)
5665{
Fred Drakee4315f52000-05-09 19:53:39 +00005666 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667}
5668
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005669PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670"S.swapcase() -> unicode\n\
5671\n\
5672Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005673and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674
5675static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005676unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 return fixup(self, fixswapcase);
5679}
5680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005681PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682"S.translate(table) -> unicode\n\
5683\n\
5684Return a copy of the string S, where all characters have been mapped\n\
5685through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005686Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5687Unmapped characters are left untouched. Characters mapped to None\n\
5688are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
5690static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005691unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692{
Tim Petersced69f82003-09-16 20:30:58 +00005693 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00005695 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 "ignore");
5697}
5698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005699PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700"S.upper() -> unicode\n\
5701\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005702Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703
5704static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005705unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 return fixup(self, fixupper);
5708}
5709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005710PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711"S.zfill(width) -> unicode\n\
5712\n\
5713Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005714of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715
5716static PyObject *
5717unicode_zfill(PyUnicodeObject *self, PyObject *args)
5718{
5719 int fill;
5720 PyUnicodeObject *u;
5721
5722 int width;
5723 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5724 return NULL;
5725
5726 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005727 if (PyUnicode_CheckExact(self)) {
5728 Py_INCREF(self);
5729 return (PyObject*) self;
5730 }
5731 else
5732 return PyUnicode_FromUnicode(
5733 PyUnicode_AS_UNICODE(self),
5734 PyUnicode_GET_SIZE(self)
5735 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 }
5737
5738 fill = width - self->length;
5739
5740 u = pad(self, fill, 0, '0');
5741
Walter Dörwald068325e2002-04-15 13:36:47 +00005742 if (u == NULL)
5743 return NULL;
5744
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 if (u->str[fill] == '+' || u->str[fill] == '-') {
5746 /* move sign to beginning of string */
5747 u->str[0] = u->str[fill];
5748 u->str[fill] = '0';
5749 }
5750
5751 return (PyObject*) u;
5752}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753
#if 0
/* Compiled out: would report unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size_obj;

    size_obj = PyInt_FromLong(unicode_freelist_size);
    return size_obj;
}
#endif
5761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005762PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005763"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005765Return True if S starts with the specified prefix, False otherwise.\n\
5766With optional start, test S beginning at that position.\n\
5767With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768
5769static PyObject *
5770unicode_startswith(PyUnicodeObject *self,
5771 PyObject *args)
5772{
5773 PyUnicodeObject *substring;
5774 int start = 0;
5775 int end = INT_MAX;
5776 PyObject *result;
5777
Guido van Rossumb8872e62000-05-09 14:14:27 +00005778 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5779 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780 return NULL;
5781 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5782 (PyObject *)substring);
5783 if (substring == NULL)
5784 return NULL;
5785
Guido van Rossum77f6a652002-04-03 22:41:51 +00005786 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787
5788 Py_DECREF(substring);
5789 return result;
5790}
5791
5792
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005793PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005794"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005796Return True if S ends with the specified suffix, False otherwise.\n\
5797With optional start, test S beginning at that position.\n\
5798With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799
5800static PyObject *
5801unicode_endswith(PyUnicodeObject *self,
5802 PyObject *args)
5803{
5804 PyUnicodeObject *substring;
5805 int start = 0;
5806 int end = INT_MAX;
5807 PyObject *result;
5808
Guido van Rossumb8872e62000-05-09 14:14:27 +00005809 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5810 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 return NULL;
5812 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5813 (PyObject *)substring);
5814 if (substring == NULL)
5815 return NULL;
5816
Guido van Rossum77f6a652002-04-03 22:41:51 +00005817 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818
5819 Py_DECREF(substring);
5820 return result;
5821}
5822
5823
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005824
5825static PyObject *
5826unicode_getnewargs(PyUnicodeObject *v)
5827{
5828 return Py_BuildValue("(u#)", v->str, v->length);
5829}
5830
5831
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832static PyMethodDef unicode_methods[] = {
5833
5834 /* Order is according to common usage: often used methods should
5835 appear first, since lookup is done sequentially. */
5836
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005837 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5838 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5839 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5840 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5841 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5842 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5843 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5844 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5845 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5846 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5847 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5848 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5849 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005850 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005851/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5852 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5853 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5854 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005855 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005856 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005857 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005858 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5859 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5860 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5861 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5862 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5863 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5864 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5865 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5866 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5867 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5868 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5869 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5870 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5871 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005872 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005873#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005874 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875#endif
5876
5877#if 0
5878 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005879 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880#endif
5881
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005882 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 {NULL, NULL}
5884};
5885
Neil Schemenauerce30bc92002-11-18 16:10:18 +00005886static PyObject *
5887unicode_mod(PyObject *v, PyObject *w)
5888{
5889 if (!PyUnicode_Check(v)) {
5890 Py_INCREF(Py_NotImplemented);
5891 return Py_NotImplemented;
5892 }
5893 return PyUnicode_Format(v, w);
5894}
5895
5896static PyNumberMethods unicode_as_number = {
5897 0, /*nb_add*/
5898 0, /*nb_subtract*/
5899 0, /*nb_multiply*/
5900 0, /*nb_divide*/
5901 unicode_mod, /*nb_remainder*/
5902};
5903
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904static PySequenceMethods unicode_as_sequence = {
5905 (inquiry) unicode_length, /* sq_length */
5906 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5907 (intargfunc) unicode_repeat, /* sq_repeat */
5908 (intargfunc) unicode_getitem, /* sq_item */
5909 (intintargfunc) unicode_slice, /* sq_slice */
5910 0, /* sq_ass_item */
5911 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005912 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913};
5914
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005915static PyObject*
5916unicode_subscript(PyUnicodeObject* self, PyObject* item)
5917{
5918 if (PyInt_Check(item)) {
5919 long i = PyInt_AS_LONG(item);
5920 if (i < 0)
5921 i += PyString_GET_SIZE(self);
5922 return unicode_getitem(self, i);
5923 } else if (PyLong_Check(item)) {
5924 long i = PyLong_AsLong(item);
5925 if (i == -1 && PyErr_Occurred())
5926 return NULL;
5927 if (i < 0)
5928 i += PyString_GET_SIZE(self);
5929 return unicode_getitem(self, i);
5930 } else if (PySlice_Check(item)) {
5931 int start, stop, step, slicelength, cur, i;
5932 Py_UNICODE* source_buf;
5933 Py_UNICODE* result_buf;
5934 PyObject* result;
5935
5936 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5937 &start, &stop, &step, &slicelength) < 0) {
5938 return NULL;
5939 }
5940
5941 if (slicelength <= 0) {
5942 return PyUnicode_FromUnicode(NULL, 0);
5943 } else {
5944 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5945 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5946
5947 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5948 result_buf[i] = source_buf[cur];
5949 }
Tim Petersced69f82003-09-16 20:30:58 +00005950
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005951 result = PyUnicode_FromUnicode(result_buf, slicelength);
5952 PyMem_FREE(result_buf);
5953 return result;
5954 }
5955 } else {
5956 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5957 return NULL;
5958 }
5959}
5960
5961static PyMappingMethods unicode_as_mapping = {
5962 (inquiry)unicode_length, /* mp_length */
5963 (binaryfunc)unicode_subscript, /* mp_subscript */
5964 (objobjargproc)0, /* mp_ass_subscript */
5965};
5966
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967static int
5968unicode_buffer_getreadbuf(PyUnicodeObject *self,
5969 int index,
5970 const void **ptr)
5971{
5972 if (index != 0) {
5973 PyErr_SetString(PyExc_SystemError,
5974 "accessing non-existent unicode segment");
5975 return -1;
5976 }
5977 *ptr = (void *) self->str;
5978 return PyUnicode_GET_DATA_SIZE(self);
5979}
5980
5981static int
5982unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5983 const void **ptr)
5984{
5985 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005986 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 return -1;
5988}
5989
5990static int
5991unicode_buffer_getsegcount(PyUnicodeObject *self,
5992 int *lenp)
5993{
5994 if (lenp)
5995 *lenp = PyUnicode_GET_DATA_SIZE(self);
5996 return 1;
5997}
5998
5999static int
6000unicode_buffer_getcharbuf(PyUnicodeObject *self,
6001 int index,
6002 const void **ptr)
6003{
6004 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006005
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 if (index != 0) {
6007 PyErr_SetString(PyExc_SystemError,
6008 "accessing non-existent unicode segment");
6009 return -1;
6010 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006011 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 if (str == NULL)
6013 return -1;
6014 *ptr = (void *) PyString_AS_STRING(str);
6015 return PyString_GET_SIZE(str);
6016}
6017
6018/* Helpers for PyUnicode_Format() */
6019
6020static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006021getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022{
6023 int argidx = *p_argidx;
6024 if (argidx < arglen) {
6025 (*p_argidx)++;
6026 if (arglen < 0)
6027 return args;
6028 else
6029 return PyTuple_GetItem(args, argidx);
6030 }
6031 PyErr_SetString(PyExc_TypeError,
6032 "not enough arguments for format string");
6033 return NULL;
6034}
6035
/* Bit flags collected while parsing a format specifier. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
6041
6042static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044{
6045 register int i;
6046 int len;
6047 va_list va;
6048 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050
6051 /* First, format the string as char array, then expand to Py_UNICODE
6052 array. */
6053 charbuffer = (char *)buffer;
6054 len = vsprintf(charbuffer, format, va);
6055 for (i = len - 1; i >= 0; i--)
6056 buffer[i] = (Py_UNICODE) charbuffer[i];
6057
6058 va_end(va);
6059 return len;
6060}
6061
Guido van Rossum078151d2002-08-11 04:24:12 +00006062/* XXX To save some code duplication, formatfloat/long/int could have been
6063 shared with stringobject.c, converting from 8-bit to Unicode after the
6064 formatting is done. */
6065
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066static int
6067formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006068 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 int flags,
6070 int prec,
6071 int type,
6072 PyObject *v)
6073{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006074 /* fmt = '%#.' + `prec` + `type`
6075 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 char fmt[20];
6077 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006078
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 x = PyFloat_AsDouble(v);
6080 if (x == -1.0 && PyErr_Occurred())
6081 return -1;
6082 if (prec < 0)
6083 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6085 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006086 /* Worst case length calc to ensure no buffer overrun:
6087
6088 'g' formats:
6089 fmt = %#.<prec>g
6090 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6091 for any double rep.)
6092 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6093
6094 'f' formats:
6095 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6096 len = 1 + 50 + 1 + prec = 52 + prec
6097
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006098 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006099 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006100
6101 */
6102 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6103 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006104 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006105 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006106 return -1;
6107 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006108 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6109 (flags&F_ALT) ? "#" : "",
6110 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 return usprintf(buf, fmt, x);
6112}
6113
Tim Peters38fd5b62000-09-21 05:43:11 +00006114static PyObject*
6115formatlong(PyObject *val, int flags, int prec, int type)
6116{
6117 char *buf;
6118 int i, len;
6119 PyObject *str; /* temporary string object. */
6120 PyUnicodeObject *result;
6121
6122 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6123 if (!str)
6124 return NULL;
6125 result = _PyUnicode_New(len);
6126 for (i = 0; i < len; i++)
6127 result->str[i] = buf[i];
6128 result->str[len] = 0;
6129 Py_DECREF(str);
6130 return (PyObject*)result;
6131}
6132
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133static int
6134formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006135 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 int flags,
6137 int prec,
6138 int type,
6139 PyObject *v)
6140{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006141 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006142 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6143 * + 1 + 1
6144 * = 24
6145 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006146 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 long x;
6148
6149 x = PyInt_AsLong(v);
6150 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006151 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00006152 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00006153 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00006154 "%u/%o/%x/%X of negative int will return "
6155 "a signed string in Python 2.4 and up") < 0)
6156 return -1;
6157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006159 prec = 1;
6160
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006161 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006162 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6163 */
6164 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006165 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006166 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006167 return -1;
6168 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006169
6170 if ((flags & F_ALT) &&
6171 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006172 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006173 * of issues that cause pain:
6174 * - when 0 is being converted, the C standard leaves off
6175 * the '0x' or '0X', which is inconsistent with other
6176 * %#x/%#X conversions and inconsistent with Python's
6177 * hex() function
6178 * - there are platforms that violate the standard and
6179 * convert 0 with the '0x' or '0X'
6180 * (Metrowerks, Compaq Tru64)
6181 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006182 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006183 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006184 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006185 * We can achieve the desired consistency by inserting our
6186 * own '0x' or '0X' prefix, and substituting %x/%X in place
6187 * of %#x/%#X.
6188 *
6189 * Note that this is the same approach as used in
6190 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006191 */
Tim Petersced69f82003-09-16 20:30:58 +00006192 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006193 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006194 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006195 else {
6196 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
Tim Petersced69f82003-09-16 20:30:58 +00006197 (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006198 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 return usprintf(buf, fmt, x);
6201}
6202
6203static int
6204formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006205 size_t buflen,
6206 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006208 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006209 if (PyUnicode_Check(v)) {
6210 if (PyUnicode_GET_SIZE(v) != 1)
6211 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006215 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006216 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006217 goto onError;
6218 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6219 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220
6221 else {
6222 /* Integer input truncated to a character */
6223 long x;
6224 x = PyInt_AsLong(v);
6225 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006226 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006227#ifdef Py_UNICODE_WIDE
6228 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006229 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006230 "%c arg not in range(0x110000) "
6231 "(wide Python build)");
6232 return -1;
6233 }
6234#else
6235 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006236 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006237 "%c arg not in range(0x10000) "
6238 "(narrow Python build)");
6239 return -1;
6240 }
6241#endif
6242 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 }
6244 buf[1] = '\0';
6245 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006246
6247 onError:
6248 PyErr_SetString(PyExc_TypeError,
6249 "%c requires int or char");
6250 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251}
6252
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006253/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6254
6255 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6256 chars are formatted. XXX This is a magic number. Each formatting
6257 routine does bounds checking to ensure no overflow, but a better
6258 solution may be to malloc a buffer of appropriate size for each
6259 format. For now, the current solution is sufficient.
6260*/
6261#define FORMATBUFLEN (size_t)120
6262
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263PyObject *PyUnicode_Format(PyObject *format,
6264 PyObject *args)
6265{
6266 Py_UNICODE *fmt, *res;
6267 int fmtcnt, rescnt, reslen, arglen, argidx;
6268 int args_owned = 0;
6269 PyUnicodeObject *result = NULL;
6270 PyObject *dict = NULL;
6271 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006272
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 if (format == NULL || args == NULL) {
6274 PyErr_BadInternalCall();
6275 return NULL;
6276 }
6277 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006278 if (uformat == NULL)
6279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 fmt = PyUnicode_AS_UNICODE(uformat);
6281 fmtcnt = PyUnicode_GET_SIZE(uformat);
6282
6283 reslen = rescnt = fmtcnt + 100;
6284 result = _PyUnicode_New(reslen);
6285 if (result == NULL)
6286 goto onError;
6287 res = PyUnicode_AS_UNICODE(result);
6288
6289 if (PyTuple_Check(args)) {
6290 arglen = PyTuple_Size(args);
6291 argidx = 0;
6292 }
6293 else {
6294 arglen = -1;
6295 argidx = -2;
6296 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006297 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6298 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 dict = args;
6300
6301 while (--fmtcnt >= 0) {
6302 if (*fmt != '%') {
6303 if (--rescnt < 0) {
6304 rescnt = fmtcnt + 100;
6305 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006306 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 return NULL;
6308 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6309 --rescnt;
6310 }
6311 *res++ = *fmt++;
6312 }
6313 else {
6314 /* Got a format specifier */
6315 int flags = 0;
6316 int width = -1;
6317 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 Py_UNICODE c = '\0';
6319 Py_UNICODE fill;
6320 PyObject *v = NULL;
6321 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006322 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 Py_UNICODE sign;
6324 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006325 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326
6327 fmt++;
6328 if (*fmt == '(') {
6329 Py_UNICODE *keystart;
6330 int keylen;
6331 PyObject *key;
6332 int pcount = 1;
6333
6334 if (dict == NULL) {
6335 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006336 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337 goto onError;
6338 }
6339 ++fmt;
6340 --fmtcnt;
6341 keystart = fmt;
6342 /* Skip over balanced parentheses */
6343 while (pcount > 0 && --fmtcnt >= 0) {
6344 if (*fmt == ')')
6345 --pcount;
6346 else if (*fmt == '(')
6347 ++pcount;
6348 fmt++;
6349 }
6350 keylen = fmt - keystart - 1;
6351 if (fmtcnt < 0 || pcount > 0) {
6352 PyErr_SetString(PyExc_ValueError,
6353 "incomplete format key");
6354 goto onError;
6355 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006356#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006357 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 then looked up since Python uses strings to hold
6359 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006360 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 key = PyUnicode_EncodeUTF8(keystart,
6362 keylen,
6363 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006364#else
6365 key = PyUnicode_FromUnicode(keystart, keylen);
6366#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 if (key == NULL)
6368 goto onError;
6369 if (args_owned) {
6370 Py_DECREF(args);
6371 args_owned = 0;
6372 }
6373 args = PyObject_GetItem(dict, key);
6374 Py_DECREF(key);
6375 if (args == NULL) {
6376 goto onError;
6377 }
6378 args_owned = 1;
6379 arglen = -1;
6380 argidx = -2;
6381 }
6382 while (--fmtcnt >= 0) {
6383 switch (c = *fmt++) {
6384 case '-': flags |= F_LJUST; continue;
6385 case '+': flags |= F_SIGN; continue;
6386 case ' ': flags |= F_BLANK; continue;
6387 case '#': flags |= F_ALT; continue;
6388 case '0': flags |= F_ZERO; continue;
6389 }
6390 break;
6391 }
6392 if (c == '*') {
6393 v = getnextarg(args, arglen, &argidx);
6394 if (v == NULL)
6395 goto onError;
6396 if (!PyInt_Check(v)) {
6397 PyErr_SetString(PyExc_TypeError,
6398 "* wants int");
6399 goto onError;
6400 }
6401 width = PyInt_AsLong(v);
6402 if (width < 0) {
6403 flags |= F_LJUST;
6404 width = -width;
6405 }
6406 if (--fmtcnt >= 0)
6407 c = *fmt++;
6408 }
6409 else if (c >= '0' && c <= '9') {
6410 width = c - '0';
6411 while (--fmtcnt >= 0) {
6412 c = *fmt++;
6413 if (c < '0' || c > '9')
6414 break;
6415 if ((width*10) / 10 != width) {
6416 PyErr_SetString(PyExc_ValueError,
6417 "width too big");
6418 goto onError;
6419 }
6420 width = width*10 + (c - '0');
6421 }
6422 }
6423 if (c == '.') {
6424 prec = 0;
6425 if (--fmtcnt >= 0)
6426 c = *fmt++;
6427 if (c == '*') {
6428 v = getnextarg(args, arglen, &argidx);
6429 if (v == NULL)
6430 goto onError;
6431 if (!PyInt_Check(v)) {
6432 PyErr_SetString(PyExc_TypeError,
6433 "* wants int");
6434 goto onError;
6435 }
6436 prec = PyInt_AsLong(v);
6437 if (prec < 0)
6438 prec = 0;
6439 if (--fmtcnt >= 0)
6440 c = *fmt++;
6441 }
6442 else if (c >= '0' && c <= '9') {
6443 prec = c - '0';
6444 while (--fmtcnt >= 0) {
6445 c = Py_CHARMASK(*fmt++);
6446 if (c < '0' || c > '9')
6447 break;
6448 if ((prec*10) / 10 != prec) {
6449 PyErr_SetString(PyExc_ValueError,
6450 "prec too big");
6451 goto onError;
6452 }
6453 prec = prec*10 + (c - '0');
6454 }
6455 }
6456 } /* prec */
6457 if (fmtcnt >= 0) {
6458 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 if (--fmtcnt >= 0)
6460 c = *fmt++;
6461 }
6462 }
6463 if (fmtcnt < 0) {
6464 PyErr_SetString(PyExc_ValueError,
6465 "incomplete format");
6466 goto onError;
6467 }
6468 if (c != '%') {
6469 v = getnextarg(args, arglen, &argidx);
6470 if (v == NULL)
6471 goto onError;
6472 }
6473 sign = 0;
6474 fill = ' ';
6475 switch (c) {
6476
6477 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006478 pbuf = formatbuf;
6479 /* presume that buffer length is at least 1 */
6480 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 len = 1;
6482 break;
6483
6484 case 's':
6485 case 'r':
6486 if (PyUnicode_Check(v) && c == 's') {
6487 temp = v;
6488 Py_INCREF(temp);
6489 }
6490 else {
6491 PyObject *unicode;
6492 if (c == 's')
6493 temp = PyObject_Str(v);
6494 else
6495 temp = PyObject_Repr(v);
6496 if (temp == NULL)
6497 goto onError;
6498 if (!PyString_Check(temp)) {
6499 /* XXX Note: this should never happen, since
6500 PyObject_Repr() and PyObject_Str() assure
6501 this */
6502 Py_DECREF(temp);
6503 PyErr_SetString(PyExc_TypeError,
6504 "%s argument has non-string str()");
6505 goto onError;
6506 }
Fred Drakee4315f52000-05-09 19:53:39 +00006507 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006509 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 "strict");
6511 Py_DECREF(temp);
6512 temp = unicode;
6513 if (temp == NULL)
6514 goto onError;
6515 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006516 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 len = PyUnicode_GET_SIZE(temp);
6518 if (prec >= 0 && len > prec)
6519 len = prec;
6520 break;
6521
6522 case 'i':
6523 case 'd':
6524 case 'u':
6525 case 'o':
6526 case 'x':
6527 case 'X':
6528 if (c == 'i')
6529 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006530 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006531 temp = formatlong(v, flags, prec, c);
6532 if (!temp)
6533 goto onError;
6534 pbuf = PyUnicode_AS_UNICODE(temp);
6535 len = PyUnicode_GET_SIZE(temp);
6536 /* unbounded ints can always produce
6537 a sign character! */
6538 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006540 else {
6541 pbuf = formatbuf;
6542 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6543 flags, prec, c, v);
6544 if (len < 0)
6545 goto onError;
6546 /* only d conversion is signed */
6547 sign = c == 'd';
6548 }
6549 if (flags & F_ZERO)
6550 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 break;
6552
6553 case 'e':
6554 case 'E':
6555 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006556 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 case 'g':
6558 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006559 if (c == 'F')
6560 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006561 pbuf = formatbuf;
6562 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6563 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 if (len < 0)
6565 goto onError;
6566 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006567 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 fill = '0';
6569 break;
6570
6571 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006572 pbuf = formatbuf;
6573 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 if (len < 0)
6575 goto onError;
6576 break;
6577
6578 default:
6579 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006580 "unsupported format character '%c' (0x%x) "
6581 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006582 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006583 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006584 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 goto onError;
6586 }
6587 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006588 if (*pbuf == '-' || *pbuf == '+') {
6589 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 len--;
6591 }
6592 else if (flags & F_SIGN)
6593 sign = '+';
6594 else if (flags & F_BLANK)
6595 sign = ' ';
6596 else
6597 sign = 0;
6598 }
6599 if (width < len)
6600 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006601 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 reslen -= rescnt;
6603 rescnt = width + fmtcnt + 100;
6604 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006605 if (reslen < 0) {
6606 Py_DECREF(result);
6607 return PyErr_NoMemory();
6608 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006609 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 return NULL;
6611 res = PyUnicode_AS_UNICODE(result)
6612 + reslen - rescnt;
6613 }
6614 if (sign) {
6615 if (fill != ' ')
6616 *res++ = sign;
6617 rescnt--;
6618 if (width > len)
6619 width--;
6620 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006621 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6622 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006623 assert(pbuf[1] == c);
6624 if (fill != ' ') {
6625 *res++ = *pbuf++;
6626 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006627 }
Tim Petersfff53252001-04-12 18:38:48 +00006628 rescnt -= 2;
6629 width -= 2;
6630 if (width < 0)
6631 width = 0;
6632 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 if (width > len && !(flags & F_LJUST)) {
6635 do {
6636 --rescnt;
6637 *res++ = fill;
6638 } while (--width > len);
6639 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006640 if (fill == ' ') {
6641 if (sign)
6642 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006643 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006644 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006645 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006646 *res++ = *pbuf++;
6647 *res++ = *pbuf++;
6648 }
6649 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006650 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 res += len;
6652 rescnt -= len;
6653 while (--width >= len) {
6654 --rescnt;
6655 *res++ = ' ';
6656 }
6657 if (dict && (argidx < arglen) && c != '%') {
6658 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006659 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 goto onError;
6661 }
6662 Py_XDECREF(temp);
6663 } /* '%' */
6664 } /* until end */
6665 if (argidx < arglen && !dict) {
6666 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006667 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 goto onError;
6669 }
6670
6671 if (args_owned) {
6672 Py_DECREF(args);
6673 }
6674 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00006675 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006676 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 return (PyObject *)result;
6678
6679 onError:
6680 Py_XDECREF(result);
6681 Py_DECREF(uformat);
6682 if (args_owned) {
6683 Py_DECREF(args);
6684 }
6685 return NULL;
6686}
6687
/* Buffer-interface slot table for unicode objects.  PyUnicode_Type refers
   to it through its tp_as_buffer slot.  The entries are, in order, the
   read-buffer, write-buffer, segment-count and char-buffer handlers. */
6688static PyBufferProcs unicode_as_buffer = {
6689 (getreadbufferproc) unicode_buffer_getreadbuf,
6690 (getwritebufferproc) unicode_buffer_getwritebuf,
6691 (getsegcountproc) unicode_buffer_getsegcount,
6692 (getcharbufferproc) unicode_buffer_getcharbuf,
6693};
6694
/* Forward declaration: unicode_new() delegates to this function when it is
   asked for a type other than PyUnicode_Type, and the definition comes
   after unicode_new(). */
Jeremy Hylton938ace62002-07-17 16:30:39 +00006695static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006696unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6697
Tim Peters6d6c1a32001-08-02 04:15:00 +00006698static PyObject *
6699unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6700{
6701 PyObject *x = NULL;
6702 static char *kwlist[] = {"string", "encoding", "errors", 0};
6703 char *encoding = NULL;
6704 char *errors = NULL;
6705
Guido van Rossume023fe02001-08-30 03:12:59 +00006706 if (type != &PyUnicode_Type)
6707 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006708 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6709 kwlist, &x, &encoding, &errors))
6710 return NULL;
6711 if (x == NULL)
6712 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006713 if (encoding == NULL && errors == NULL)
6714 return PyObject_Unicode(x);
6715 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006716 return PyUnicode_FromEncodedObject(x, encoding, errors);
6717}
6718
Guido van Rossume023fe02001-08-30 03:12:59 +00006719static PyObject *
6720unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6721{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006722 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006723 int n;
6724
6725 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6726 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6727 if (tmp == NULL)
6728 return NULL;
6729 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006730 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006731 if (pnew == NULL) {
6732 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00006733 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00006734 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006735 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6736 if (pnew->str == NULL) {
6737 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006738 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006739 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00006740 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00006741 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006742 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6743 pnew->length = n;
6744 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006745 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006746 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006747}
6748
/* Docstring of the unicode type; PyUnicode_Type uses it as its tp_doc. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006749PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00006750"unicode(string [, encoding[, errors]]) -> object\n\
6751\n\
6752Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00006753encoding defaults to the current default string encoding.\n\
6754errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006755
/* Type object for the built-in "unicode" type.

   The initializer is positional, so the order of the entries must follow
   the field order of PyTypeObject; the trailing comment on each line names
   the slot being filled.  The type derives from PyBaseString_Type, may be
   subclassed (Py_TPFLAGS_BASETYPE), creates instances through unicode_new
   and frees them with PyObject_Del. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756PyTypeObject PyUnicode_Type = {
6757 PyObject_HEAD_INIT(&PyType_Type)
6758 0, /* ob_size */
6759 "unicode", /* tp_name */
6760 sizeof(PyUnicodeObject), /* tp_basicsize */
6761 0, /* tp_itemsize */
6762 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00006763 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006765 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766 0, /* tp_setattr */
6767 (cmpfunc) unicode_compare, /* tp_compare */
6768 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006769 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006771 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772 (hashfunc) unicode_hash, /* tp_hash*/
6773 0, /* tp_call*/
6774 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006775 PyObject_GenericGetAttr, /* tp_getattro */
6776 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006778 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
6779 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006780 unicode_doc, /* tp_doc */
6781 0, /* tp_traverse */
6782 0, /* tp_clear */
6783 0, /* tp_richcompare */
6784 0, /* tp_weaklistoffset */
6785 0, /* tp_iter */
6786 0, /* tp_iternext */
6787 unicode_methods, /* tp_methods */
6788 0, /* tp_members */
6789 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00006790 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006791 0, /* tp_dict */
6792 0, /* tp_descr_get */
6793 0, /* tp_descr_set */
6794 0, /* tp_dictoffset */
6795 0, /* tp_init */
6796 0, /* tp_alloc */
6797 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006798 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799};
6800
6801/* Initialize the Unicode implementation */
6802
Thomas Wouters78890102000-07-22 19:25:51 +00006803void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006805 int i;
6806
Fred Drakee4315f52000-05-09 19:53:39 +00006807 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006808 unicode_freelist = NULL;
6809 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00006811 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006812 for (i = 0; i < 256; i++)
6813 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006814 if (PyType_Ready(&PyUnicode_Type) < 0)
6815 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816}
6817
6818/* Finalize the Unicode implementation */
6819
6820void
Thomas Wouters78890102000-07-22 19:25:51 +00006821_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006823 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006824 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006826 Py_XDECREF(unicode_empty);
6827 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006828
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006829 for (i = 0; i < 256; i++) {
6830 if (unicode_latin1[i]) {
6831 Py_DECREF(unicode_latin1[i]);
6832 unicode_latin1[i] = NULL;
6833 }
6834 }
6835
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006836 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 PyUnicodeObject *v = u;
6838 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006839 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006840 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006841 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006842 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006844 unicode_freelist = NULL;
6845 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006847
6848/*
6849Local variables:
6850c-basic-offset: 4
6851indent-tabs-mode: nil
6852End:
6853*/