blob: e4fe53169567c7711e79e560de3092b6ea219bc5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep-alive" optimization.

   When an object is put on the free list, its character buffer is kept
   allocated as long as the object's length is below this limit; buffers
   of longer objects are released.  This saves malloc() calls for small
   Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused memory allocated.

   A limit of 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless the build
   configuration defines WORDS_BIGENDIAN. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of the caller's object variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
371 for (i = size; i >= 0; i--)
372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
387 if (size > PyUnicode_GET_SIZE(unicode))
388 size = PyUnicode_GET_SIZE(unicode);
389#ifdef HAVE_USABLE_WCHAR_T
390 memcpy(w, unicode->str, size * sizeof(wchar_t));
391#else
392 {
393 register Py_UNICODE *u;
394 register int i;
395 u = PyUnicode_AS_UNICODE(unicode);
396 for (i = size; i >= 0; i--)
397 *w++ = *u++;
398 }
399#endif
400
401 return size;
402}
403
404#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
408 Py_UNICODE s[2];
409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
426 if (ordinal <= 0xffff) {
427 /* UCS-2 character */
428 s[0] = (Py_UNICODE) ordinal;
429 return PyUnicode_FromUnicode(s, 1);
430 }
431 else {
432#ifndef Py_UNICODE_WIDE
433 /* UCS-4 character. store as two surrogate characters */
434 ordinal -= 0x10000L;
435 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
436 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
437 return PyUnicode_FromUnicode(s, 2);
438#else
439 s[0] = (Py_UNICODE)ordinal;
440 return PyUnicode_FromUnicode(s, 1);
441#endif
442 }
443}
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445PyObject *PyUnicode_FromObject(register PyObject *obj)
446{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000447 /* XXX Perhaps we should make this API an alias of
448 PyObject_Unicode() instead ?! */
449 if (PyUnicode_CheckExact(obj)) {
450 Py_INCREF(obj);
451 return obj;
452 }
453 if (PyUnicode_Check(obj)) {
454 /* For a Unicode subtype that's not a Unicode object,
455 return a true Unicode object with the same data. */
456 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
457 PyUnicode_GET_SIZE(obj));
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
460}
461
462PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
463 const char *encoding,
464 const char *errors)
465{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000466 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000468 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000469
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 if (obj == NULL) {
471 PyErr_BadInternalCall();
472 return NULL;
473 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000474
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000475#if 0
476 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000477 that no encodings is given and then redirect to
478 PyObject_Unicode() which then applies the additional logic for
479 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481 NOTE: This API should really only be used for object which
482 represent *encoded* Unicode !
483
484 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000485 if (PyUnicode_Check(obj)) {
486 if (encoding) {
487 PyErr_SetString(PyExc_TypeError,
488 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000489 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000492 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000493#else
494 if (PyUnicode_Check(obj)) {
495 PyErr_SetString(PyExc_TypeError,
496 "decoding Unicode is not supported");
497 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000498 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499#endif
500
501 /* Coerce object */
502 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000503 s = PyString_AS_STRING(obj);
504 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000505 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000506 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
507 /* Overwrite the error message with something more useful in
508 case of a TypeError. */
509 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000511 "coercing to Unicode: need string or buffer, "
512 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000513 obj->ob_type->tp_name);
514 goto onError;
515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000518 if (len == 0) {
519 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521 }
Tim Petersced69f82003-09-16 20:30:58 +0000522 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000523 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000524
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000525 return v;
526
527 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529}
530
531PyObject *PyUnicode_Decode(const char *s,
532 int size,
533 const char *encoding,
534 const char *errors)
535{
536 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000537
538 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000539 encoding = PyUnicode_GetDefaultEncoding();
540
541 /* Shortcuts for common default encodings */
542 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000544 else if (strcmp(encoding, "latin-1") == 0)
545 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000546#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
547 else if (strcmp(encoding, "mbcs") == 0)
548 return PyUnicode_DecodeMBCS(s, size, errors);
549#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000550 else if (strcmp(encoding, "ascii") == 0)
551 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552
553 /* Decode via the codec registry */
554 buffer = PyBuffer_FromMemory((void *)s, size);
555 if (buffer == NULL)
556 goto onError;
557 unicode = PyCodec_Decode(buffer, encoding, errors);
558 if (unicode == NULL)
559 goto onError;
560 if (!PyUnicode_Check(unicode)) {
561 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000562 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000563 unicode->ob_type->tp_name);
564 Py_DECREF(unicode);
565 goto onError;
566 }
567 Py_DECREF(buffer);
568 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000569
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 onError:
571 Py_XDECREF(buffer);
572 return NULL;
573}
574
575PyObject *PyUnicode_Encode(const Py_UNICODE *s,
576 int size,
577 const char *encoding,
578 const char *errors)
579{
580 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000581
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 unicode = PyUnicode_FromUnicode(s, size);
583 if (unicode == NULL)
584 return NULL;
585 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
586 Py_DECREF(unicode);
587 return v;
588}
589
590PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
591 const char *encoding,
592 const char *errors)
593{
594 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596 if (!PyUnicode_Check(unicode)) {
597 PyErr_BadArgument();
598 goto onError;
599 }
Fred Drakee4315f52000-05-09 19:53:39 +0000600
Tim Petersced69f82003-09-16 20:30:58 +0000601 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000602 encoding = PyUnicode_GetDefaultEncoding();
603
604 /* Shortcuts for common default encodings */
605 if (errors == NULL) {
606 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000607 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000608 else if (strcmp(encoding, "latin-1") == 0)
609 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000610#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
611 else if (strcmp(encoding, "mbcs") == 0)
612 return PyUnicode_AsMBCSString(unicode);
613#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000614 else if (strcmp(encoding, "ascii") == 0)
615 return PyUnicode_AsASCIIString(unicode);
616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617
618 /* Encode via the codec registry */
619 v = PyCodec_Encode(unicode, encoding, errors);
620 if (v == NULL)
621 goto onError;
622 /* XXX Should we really enforce this ? */
623 if (!PyString_Check(v)) {
624 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000625 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 v->ob_type->tp_name);
627 Py_DECREF(v);
628 goto onError;
629 }
630 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000631
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632 onError:
633 return NULL;
634}
635
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000636PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
637 const char *errors)
638{
639 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
640
641 if (v)
642 return v;
643 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
644 if (v && errors == NULL)
645 ((PyUnicodeObject *)unicode)->defenc = v;
646 return v;
647}
648
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
650{
651 if (!PyUnicode_Check(unicode)) {
652 PyErr_BadArgument();
653 goto onError;
654 }
655 return PyUnicode_AS_UNICODE(unicode);
656
657 onError:
658 return NULL;
659}
660
661int PyUnicode_GetSize(PyObject *unicode)
662{
663 if (!PyUnicode_Check(unicode)) {
664 PyErr_BadArgument();
665 goto onError;
666 }
667 return PyUnicode_GET_SIZE(unicode);
668
669 onError:
670 return -1;
671}
672
Thomas Wouters78890102000-07-22 19:25:51 +0000673const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000674{
675 return unicode_default_encoding;
676}
677
678int PyUnicode_SetDefaultEncoding(const char *encoding)
679{
680 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000681
Fred Drakee4315f52000-05-09 19:53:39 +0000682 /* Make sure the encoding is valid. As side effect, this also
683 loads the encoding into the codec registry cache. */
684 v = _PyCodec_Lookup(encoding);
685 if (v == NULL)
686 goto onError;
687 Py_DECREF(v);
688 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000689 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000690 sizeof(unicode_default_encoding));
691 return 0;
692
693 onError:
694 return -1;
695}
696
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000697/* error handling callback helper:
698 build arguments, call the callback and check the arguments,
699 if no exception occured, copy the replacement to the output
700 and adjust various state variables.
701 return 0 on success, -1 on error
702*/
703
704static
705int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
706 const char *encoding, const char *reason,
707 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
708 PyObject **output, int *outpos, Py_UNICODE **outptr)
709{
710 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
711
712 PyObject *restuple = NULL;
713 PyObject *repunicode = NULL;
714 int outsize = PyUnicode_GET_SIZE(*output);
715 int requiredsize;
716 int newpos;
717 Py_UNICODE *repptr;
718 int repsize;
719 int res = -1;
720
721 if (*errorHandler == NULL) {
722 *errorHandler = PyCodec_LookupError(errors);
723 if (*errorHandler == NULL)
724 goto onError;
725 }
726
727 if (*exceptionObject == NULL) {
728 *exceptionObject = PyUnicodeDecodeError_Create(
729 encoding, input, insize, *startinpos, *endinpos, reason);
730 if (*exceptionObject == NULL)
731 goto onError;
732 }
733 else {
734 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
735 goto onError;
736 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
737 goto onError;
738 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
739 goto onError;
740 }
741
742 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
743 if (restuple == NULL)
744 goto onError;
745 if (!PyTuple_Check(restuple)) {
746 PyErr_Format(PyExc_TypeError, &argparse[4]);
747 goto onError;
748 }
749 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
750 goto onError;
751 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000752 newpos = insize+newpos;
753 if (newpos<0 || newpos>insize) {
754 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
755 goto onError;
756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000757
758 /* need more space? (at least enough for what we
759 have+the replacement+the rest of the string (starting
760 at the new input position), so we won't have to check space
761 when there are no errors in the rest of the string) */
762 repptr = PyUnicode_AS_UNICODE(repunicode);
763 repsize = PyUnicode_GET_SIZE(repunicode);
764 requiredsize = *outpos + repsize + insize-newpos;
765 if (requiredsize > outsize) {
766 if (requiredsize<2*outsize)
767 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000768 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000769 goto onError;
770 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
771 }
772 *endinpos = newpos;
773 *inptr = input + newpos;
774 Py_UNICODE_COPY(*outptr, repptr, repsize);
775 *outptr += repsize;
776 *outpos += repsize;
777 /* we made it! */
778 res = 0;
779
780 onError:
781 Py_XDECREF(restuple);
782 return res;
783}
784
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000785/* --- UTF-7 Codec -------------------------------------------------------- */
786
787/* see RFC2152 for details */
788
/* Classification of the 128 ASCII characters for the UTF-7 encoder:
   indicates whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f: control characters; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: 0 - 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: @ A - O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: P - Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: ` a - o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: p - z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
807
/* SPECIAL(c, encodeO, encodeWS) is true when character c may not be written
   directly into UTF-7 output: it is above 127, is always special (class 1
   in utf7_special), is whitespace (class 2) while encodeWS is set, or is in
   RFC2152 Set O (class 3) while encodeO is set.
   c is evaluated more than once; pass an expression without side effects. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c) > 127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): the base64 alphabet character for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is one of the 64 base64 alphabet characters.
   The test uses explicit ASCII ranges rather than isalnum(): callers pass
   Py_UNICODE values that can lie outside the unsigned char range (undefined
   behaviour for the <ctype.h> functions), and isalnum() may also accept
   non-ASCII letters depending on the locale.
   c is evaluated more than once. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): the six-bit value of base64 character c.  The result is only
   meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least six bits are pending in ch, write
   the top six pending bits to *out as a base64 character and advance out.
   On exit bits holds the number of bits (0..5) still pending in ch. */
#define ENCODE(out, ch, bits) \
    while ((bits) >= 6) { \
        *(out)++ = B64((ch) >> ((bits)-6)); \
        (bits) -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are pending
   in ch, extract them as one Py_UNICODE and store it through out.
   The expansion relies on the enclosing function providing a variable
   `errmsg` and a label `utf7Error`. */
#define DECODE(out, ch, bits, surrogate) \
    while ((bits) >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        (bits) -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct \
               or not */ \
            (surrogate) = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            (surrogate) = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *(out)++ = outCh; \
        } \
    }

Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000843PyObject *PyUnicode_DecodeUTF7(const char *s,
844 int size,
845 const char *errors)
846{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000847 const char *starts = s;
848 int startinpos;
849 int endinpos;
850 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000851 const char *e;
852 PyUnicodeObject *unicode;
853 Py_UNICODE *p;
854 const char *errmsg = "";
855 int inShift = 0;
856 unsigned int bitsleft = 0;
857 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000858 int surrogate = 0;
859 PyObject *errorHandler = NULL;
860 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000861
862 unicode = _PyUnicode_New(size);
863 if (!unicode)
864 return NULL;
865 if (size == 0)
866 return (PyObject *)unicode;
867
868 p = unicode->str;
869 e = s + size;
870
871 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000872 Py_UNICODE ch;
873 restart:
874 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000875
876 if (inShift) {
877 if ((ch == '-') || !B64CHAR(ch)) {
878 inShift = 0;
879 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000880
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000881 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
882 if (bitsleft >= 6) {
883 /* The shift sequence has a partial character in it. If
884 bitsleft < 6 then we could just classify it as padding
885 but that is not the case here */
886
887 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000888 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889 }
890 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000891 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000892 here so indicate the potential of a misencoded character. */
893
894 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
895 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
896 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000897 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000898 }
899
900 if (ch == '-') {
901 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000902 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000903 inShift = 1;
904 }
905 } else if (SPECIAL(ch,0,0)) {
906 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000907 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908 } else {
909 *p++ = ch;
910 }
911 } else {
912 charsleft = (charsleft << 6) | UB64(ch);
913 bitsleft += 6;
914 s++;
915 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
916 }
917 }
918 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000919 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000920 s++;
921 if (s < e && *s == '-') {
922 s++;
923 *p++ = '+';
924 } else
925 {
926 inShift = 1;
927 bitsleft = 0;
928 }
929 }
930 else if (SPECIAL(ch,0,0)) {
931 errmsg = "unexpected special character";
932 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000933 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000934 }
935 else {
936 *p++ = ch;
937 s++;
938 }
939 continue;
940 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000941 outpos = p-PyUnicode_AS_UNICODE(unicode);
942 endinpos = s-starts;
943 if (unicode_decode_call_errorhandler(
944 errors, &errorHandler,
945 "utf7", errmsg,
946 starts, size, &startinpos, &endinpos, &exc, &s,
947 (PyObject **)&unicode, &outpos, &p))
948 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 }
950
951 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000952 outpos = p-PyUnicode_AS_UNICODE(unicode);
953 endinpos = size;
954 if (unicode_decode_call_errorhandler(
955 errors, &errorHandler,
956 "utf7", "unterminated shift sequence",
957 starts, size, &startinpos, &endinpos, &exc, &s,
958 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000959 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000960 if (s < e)
961 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000962 }
963
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000964 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000965 goto onError;
966
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000967 Py_XDECREF(errorHandler);
968 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000969 return (PyObject *)unicode;
970
971onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000972 Py_XDECREF(errorHandler);
973 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974 Py_DECREF(unicode);
975 return NULL;
976}
977
978
979PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
980 int size,
981 int encodeSetO,
982 int encodeWhiteSpace,
983 const char *errors)
984{
985 PyObject *v;
986 /* It might be possible to tighten this worst case */
987 unsigned int cbAllocated = 5 * size;
988 int inShift = 0;
989 int i = 0;
990 unsigned int bitsleft = 0;
991 unsigned long charsleft = 0;
992 char * out;
993 char * start;
994
995 if (size == 0)
996 return PyString_FromStringAndSize(NULL, 0);
997
998 v = PyString_FromStringAndSize(NULL, cbAllocated);
999 if (v == NULL)
1000 return NULL;
1001
1002 start = out = PyString_AS_STRING(v);
1003 for (;i < size; ++i) {
1004 Py_UNICODE ch = s[i];
1005
1006 if (!inShift) {
1007 if (ch == '+') {
1008 *out++ = '+';
1009 *out++ = '-';
1010 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1011 charsleft = ch;
1012 bitsleft = 16;
1013 *out++ = '+';
1014 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1015 inShift = bitsleft > 0;
1016 } else {
1017 *out++ = (char) ch;
1018 }
1019 } else {
1020 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1021 *out++ = B64(charsleft << (6-bitsleft));
1022 charsleft = 0;
1023 bitsleft = 0;
1024 /* Characters not in the BASE64 set implicitly unshift the sequence
1025 so no '-' is required, except if the character is itself a '-' */
1026 if (B64CHAR(ch) || ch == '-') {
1027 *out++ = '-';
1028 }
1029 inShift = 0;
1030 *out++ = (char) ch;
1031 } else {
1032 bitsleft += 16;
1033 charsleft = (charsleft << 16) | ch;
1034 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1035
1036 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001037 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 or '-' then the shift sequence will be terminated implicitly and we
1039 don't have to insert a '-'. */
1040
1041 if (bitsleft == 0) {
1042 if (i + 1 < size) {
1043 Py_UNICODE ch2 = s[i+1];
1044
1045 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001046
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001047 } else if (B64CHAR(ch2) || ch2 == '-') {
1048 *out++ = '-';
1049 inShift = 0;
1050 } else {
1051 inShift = 0;
1052 }
1053
1054 }
1055 else {
1056 *out++ = '-';
1057 inShift = 0;
1058 }
1059 }
Tim Petersced69f82003-09-16 20:30:58 +00001060 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062 }
1063 if (bitsleft) {
1064 *out++= B64(charsleft << (6-bitsleft) );
1065 *out++ = '-';
1066 }
1067
Tim Peters5de98422002-04-27 18:44:32 +00001068 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001069 return v;
1070}
1071
1072#undef SPECIAL
1073#undef B64
1074#undef B64CHAR
1075#undef UB64
1076#undef ENCODE
1077#undef DECODE
1078
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079/* --- UTF-8 Codec -------------------------------------------------------- */
1080
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means illegal
   prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1102
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103PyObject *PyUnicode_DecodeUTF8(const char *s,
1104 int size,
1105 const char *errors)
1106{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001107 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001109 int startinpos;
1110 int endinpos;
1111 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 const char *e;
1113 PyUnicodeObject *unicode;
1114 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001115 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001116 PyObject *errorHandler = NULL;
1117 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118
1119 /* Note: size will always be longer than the resulting Unicode
1120 character count */
1121 unicode = _PyUnicode_New(size);
1122 if (!unicode)
1123 return NULL;
1124 if (size == 0)
1125 return (PyObject *)unicode;
1126
1127 /* Unpack UTF-8 encoded data */
1128 p = unicode->str;
1129 e = s + size;
1130
1131 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001132 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133
1134 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001135 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136 s++;
1137 continue;
1138 }
1139
1140 n = utf8_code_length[ch];
1141
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001142 if (s + n > e) {
1143 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001144 startinpos = s-starts;
1145 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001146 goto utf8Error;
1147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148
1149 switch (n) {
1150
1151 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001152 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001153 startinpos = s-starts;
1154 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001155 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156
1157 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001158 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001159 startinpos = s-starts;
1160 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001161 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162
1163 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001164 if ((s[1] & 0xc0) != 0x80) {
1165 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001166 startinpos = s-starts;
1167 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001168 goto utf8Error;
1169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001172 startinpos = s-starts;
1173 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001174 errmsg = "illegal encoding";
1175 goto utf8Error;
1176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 break;
1180
1181 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001182 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001183 (s[2] & 0xc0) != 0x80) {
1184 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001185 startinpos = s-starts;
1186 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001187 goto utf8Error;
1188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001190 if (ch < 0x0800) {
1191 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001192 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001193
1194 XXX For wide builds (UCS-4) we should probably try
1195 to recombine the surrogates into a single code
1196 unit.
1197 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001198 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001199 startinpos = s-starts;
1200 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001201 goto utf8Error;
1202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001204 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001205 break;
1206
1207 case 4:
1208 if ((s[1] & 0xc0) != 0x80 ||
1209 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001210 (s[3] & 0xc0) != 0x80) {
1211 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001212 startinpos = s-starts;
1213 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001214 goto utf8Error;
1215 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001216 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1217 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1218 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001219 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001220 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001221 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001222 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001223 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001225 startinpos = s-starts;
1226 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 goto utf8Error;
1228 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001229#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001230 *p++ = (Py_UNICODE)ch;
1231#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001232 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001233
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001234 /* translate from 10000..10FFFF to 0..FFFF */
1235 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001236
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001237 /* high surrogate = top 10 bits added to D800 */
1238 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001239
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001240 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001241 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001242#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 break;
1244
1245 default:
1246 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001247 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001248 startinpos = s-starts;
1249 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001250 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 }
1252 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001253 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001255 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001256 outpos = p-PyUnicode_AS_UNICODE(unicode);
1257 if (unicode_decode_call_errorhandler(
1258 errors, &errorHandler,
1259 "utf8", errmsg,
1260 starts, size, &startinpos, &endinpos, &exc, &s,
1261 (PyObject **)&unicode, &outpos, &p))
1262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 }
1264
1265 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001266 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267 goto onError;
1268
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001269 Py_XDECREF(errorHandler);
1270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271 return (PyObject *)unicode;
1272
1273onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274 Py_XDECREF(errorHandler);
1275 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 Py_DECREF(unicode);
1277 return NULL;
1278}
1279
Tim Peters602f7402002-04-27 18:03:26 +00001280/* Allocation strategy: if the string is short, convert into a stack buffer
1281 and allocate exactly as much space needed at the end. Else allocate the
1282 maximum possible needed (4 result bytes per Unicode character), and return
1283 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001284*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001285PyObject *
1286PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1287 int size,
1288 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289{
Tim Peters602f7402002-04-27 18:03:26 +00001290#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001291
Tim Peters602f7402002-04-27 18:03:26 +00001292 int i; /* index into s of next input byte */
1293 PyObject *v; /* result string object */
1294 char *p; /* next free byte in output buffer */
1295 int nallocated; /* number of result bytes allocated */
1296 int nneeded; /* number of result bytes needed */
1297 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001298
Tim Peters602f7402002-04-27 18:03:26 +00001299 assert(s != NULL);
1300 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301
Tim Peters602f7402002-04-27 18:03:26 +00001302 if (size <= MAX_SHORT_UNICHARS) {
1303 /* Write into the stack buffer; nallocated can't overflow.
1304 * At the end, we'll allocate exactly as much heap space as it
1305 * turns out we need.
1306 */
1307 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1308 v = NULL; /* will allocate after we're done */
1309 p = stackbuf;
1310 }
1311 else {
1312 /* Overallocate on the heap, and give the excess back at the end. */
1313 nallocated = size * 4;
1314 if (nallocated / 4 != size) /* overflow! */
1315 return PyErr_NoMemory();
1316 v = PyString_FromStringAndSize(NULL, nallocated);
1317 if (v == NULL)
1318 return NULL;
1319 p = PyString_AS_STRING(v);
1320 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001321
Tim Peters602f7402002-04-27 18:03:26 +00001322 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001323 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001324
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001325 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001326 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001328
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001330 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001331 *p++ = (char)(0xc0 | (ch >> 6));
1332 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001333 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001334 else {
Tim Peters602f7402002-04-27 18:03:26 +00001335 /* Encode UCS2 Unicode ordinals */
1336 if (ch < 0x10000) {
1337 /* Special case: check for high surrogate */
1338 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1339 Py_UCS4 ch2 = s[i];
1340 /* Check for low surrogate and combine the two to
1341 form a UCS4 value */
1342 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001343 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001344 i++;
1345 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 }
Tim Peters602f7402002-04-27 18:03:26 +00001347 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001349 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001350 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1351 *p++ = (char)(0x80 | (ch & 0x3f));
1352 continue;
1353 }
1354encodeUCS4:
1355 /* Encode UCS4 Unicode ordinals */
1356 *p++ = (char)(0xf0 | (ch >> 18));
1357 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1358 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1359 *p++ = (char)(0x80 | (ch & 0x3f));
1360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001362
Tim Peters602f7402002-04-27 18:03:26 +00001363 if (v == NULL) {
1364 /* This was stack allocated. */
1365 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1366 assert(nneeded <= nallocated);
1367 v = PyString_FromStringAndSize(stackbuf, nneeded);
1368 }
1369 else {
1370 /* Cut back to size actually needed. */
1371 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1372 assert(nneeded <= nallocated);
1373 _PyString_Resize(&v, nneeded);
1374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001376
Tim Peters602f7402002-04-27 18:03:26 +00001377#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378}
1379
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1381{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 if (!PyUnicode_Check(unicode)) {
1383 PyErr_BadArgument();
1384 return NULL;
1385 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001386 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1387 PyUnicode_GET_SIZE(unicode),
1388 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389}
1390
1391/* --- UTF-16 Codec ------------------------------------------------------- */
1392
Tim Peters772747b2001-08-09 22:21:55 +00001393PyObject *
1394PyUnicode_DecodeUTF16(const char *s,
1395 int size,
1396 const char *errors,
1397 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001399 const char *starts = s;
1400 int startinpos;
1401 int endinpos;
1402 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403 PyUnicodeObject *unicode;
1404 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001405 const unsigned char *q, *e;
1406 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001407 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001408 /* Offsets from q for retrieving byte pairs in the right order. */
1409#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1410 int ihi = 1, ilo = 0;
1411#else
1412 int ihi = 0, ilo = 1;
1413#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 PyObject *errorHandler = NULL;
1415 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416
1417 /* Note: size will always be longer than the resulting Unicode
1418 character count */
1419 unicode = _PyUnicode_New(size);
1420 if (!unicode)
1421 return NULL;
1422 if (size == 0)
1423 return (PyObject *)unicode;
1424
1425 /* Unpack UTF-16 encoded data */
1426 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001427 q = (unsigned char *)s;
1428 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429
1430 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001431 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001433 /* Check for BOM marks (U+FEFF) in the input and adjust current
1434 byte order setting accordingly. In native mode, the leading BOM
1435 mark is skipped, in all other modes, it is copied to the output
1436 stream as-is (giving a ZWNBSP character). */
1437 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001438 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001439#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001440 if (bom == 0xFEFF) {
1441 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001442 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001443 }
1444 else if (bom == 0xFFFE) {
1445 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001446 bo = 1;
1447 }
Tim Petersced69f82003-09-16 20:30:58 +00001448#else
Tim Peters772747b2001-08-09 22:21:55 +00001449 if (bom == 0xFEFF) {
1450 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001451 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001452 }
1453 else if (bom == 0xFFFE) {
1454 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001455 bo = -1;
1456 }
1457#endif
1458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459
Tim Peters772747b2001-08-09 22:21:55 +00001460 if (bo == -1) {
1461 /* force LE */
1462 ihi = 1;
1463 ilo = 0;
1464 }
1465 else if (bo == 1) {
1466 /* force BE */
1467 ihi = 0;
1468 ilo = 1;
1469 }
1470
1471 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 Py_UNICODE ch;
1473 /* remaing bytes at the end? (size should be even) */
1474 if (e-q<2) {
1475 errmsg = "truncated data";
1476 startinpos = ((const char *)q)-starts;
1477 endinpos = ((const char *)e)-starts;
1478 goto utf16Error;
1479 /* The remaining input chars are ignored if the callback
1480 chooses to skip the input */
1481 }
1482 ch = (q[ihi] << 8) | q[ilo];
1483
Tim Peters772747b2001-08-09 22:21:55 +00001484 q += 2;
1485
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 if (ch < 0xD800 || ch > 0xDFFF) {
1487 *p++ = ch;
1488 continue;
1489 }
1490
1491 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001492 if (q >= e) {
1493 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 startinpos = (((const char *)q)-2)-starts;
1495 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001496 goto utf16Error;
1497 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001498 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001499 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1500 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001501 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001502#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001503 *p++ = ch;
1504 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001505#else
1506 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001507#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001508 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001509 }
1510 else {
1511 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001512 startinpos = (((const char *)q)-4)-starts;
1513 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001514 goto utf16Error;
1515 }
1516
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001518 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001519 startinpos = (((const char *)q)-2)-starts;
1520 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001521 /* Fall through to report the error */
1522
1523 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 outpos = p-PyUnicode_AS_UNICODE(unicode);
1525 if (unicode_decode_call_errorhandler(
1526 errors, &errorHandler,
1527 "utf16", errmsg,
1528 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1529 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001530 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001531 }
1532
1533 if (byteorder)
1534 *byteorder = bo;
1535
1536 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001537 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001538 goto onError;
1539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 Py_XDECREF(errorHandler);
1541 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 return (PyObject *)unicode;
1543
1544onError:
1545 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001546 Py_XDECREF(errorHandler);
1547 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548 return NULL;
1549}
1550
Tim Peters772747b2001-08-09 22:21:55 +00001551PyObject *
1552PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1553 int size,
1554 const char *errors,
1555 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556{
1557 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001558 unsigned char *p;
1559 int i, pairs;
1560 /* Offsets from p for storing byte pairs in the right order. */
1561#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1562 int ihi = 1, ilo = 0;
1563#else
1564 int ihi = 0, ilo = 1;
1565#endif
1566
1567#define STORECHAR(CH) \
1568 do { \
1569 p[ihi] = ((CH) >> 8) & 0xff; \
1570 p[ilo] = (CH) & 0xff; \
1571 p += 2; \
1572 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001574 for (i = pairs = 0; i < size; i++)
1575 if (s[i] >= 0x10000)
1576 pairs++;
Tim Petersced69f82003-09-16 20:30:58 +00001577 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001578 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579 if (v == NULL)
1580 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581
Tim Peters772747b2001-08-09 22:21:55 +00001582 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001584 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001585 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001586 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001587
1588 if (byteorder == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (byteorder == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001599 while (size-- > 0) {
1600 Py_UNICODE ch = *s++;
1601 Py_UNICODE ch2 = 0;
1602 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001603 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1604 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605 }
Tim Peters772747b2001-08-09 22:21:55 +00001606 STORECHAR(ch);
1607 if (ch2)
1608 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001611#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612}
1613
1614PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1615{
1616 if (!PyUnicode_Check(unicode)) {
1617 PyErr_BadArgument();
1618 return NULL;
1619 }
1620 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1621 PyUnicode_GET_SIZE(unicode),
1622 NULL,
1623 0);
1624}
1625
1626/* --- Unicode Escape Codec ----------------------------------------------- */
1627
Fredrik Lundh06d12682001-01-24 07:59:11 +00001628static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001629
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1631 int size,
1632 const char *errors)
1633{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001634 const char *starts = s;
1635 int startinpos;
1636 int endinpos;
1637 int outpos;
1638 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001639 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001640 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001642 char* message;
1643 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001644 PyObject *errorHandler = NULL;
1645 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 /* Escaped strings will always be longer than the resulting
1648 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 length after conversion to the true value.
1650 (but if the error callback returns a long replacement string
1651 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 v = _PyUnicode_New(size);
1653 if (v == NULL)
1654 goto onError;
1655 if (size == 0)
1656 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001658 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001660
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 while (s < end) {
1662 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001663 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665
1666 /* Non-escape characters are interpreted as Unicode ordinals */
1667 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001668 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 continue;
1670 }
1671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001672 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 /* \ - Escapes */
1674 s++;
1675 switch (*s++) {
1676
1677 /* \x escapes */
1678 case '\n': break;
1679 case '\\': *p++ = '\\'; break;
1680 case '\'': *p++ = '\''; break;
1681 case '\"': *p++ = '\"'; break;
1682 case 'b': *p++ = '\b'; break;
1683 case 'f': *p++ = '\014'; break; /* FF */
1684 case 't': *p++ = '\t'; break;
1685 case 'n': *p++ = '\n'; break;
1686 case 'r': *p++ = '\r'; break;
1687 case 'v': *p++ = '\013'; break; /* VT */
1688 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1689
1690 /* \OOO (octal) escapes */
1691 case '0': case '1': case '2': case '3':
1692 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001693 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001695 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001697 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001699 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 break;
1701
Fredrik Lundhccc74732001-02-18 22:13:49 +00001702 /* hex escapes */
1703 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001705 digits = 2;
1706 message = "truncated \\xXX escape";
1707 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708
Fredrik Lundhccc74732001-02-18 22:13:49 +00001709 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001711 digits = 4;
1712 message = "truncated \\uXXXX escape";
1713 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Fredrik Lundhccc74732001-02-18 22:13:49 +00001715 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001716 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001717 digits = 8;
1718 message = "truncated \\UXXXXXXXX escape";
1719 hexescape:
1720 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001721 outpos = p-PyUnicode_AS_UNICODE(v);
1722 if (s+digits>end) {
1723 endinpos = size;
1724 if (unicode_decode_call_errorhandler(
1725 errors, &errorHandler,
1726 "unicodeescape", "end of string in escape sequence",
1727 starts, size, &startinpos, &endinpos, &exc, &s,
1728 (PyObject **)&v, &outpos, &p))
1729 goto onError;
1730 goto nextByte;
1731 }
1732 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001733 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001734 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001735 endinpos = (s+i+1)-starts;
1736 if (unicode_decode_call_errorhandler(
1737 errors, &errorHandler,
1738 "unicodeescape", message,
1739 starts, size, &startinpos, &endinpos, &exc, &s,
1740 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001741 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001743 }
1744 chr = (chr<<4) & ~0xF;
1745 if (c >= '0' && c <= '9')
1746 chr += c - '0';
1747 else if (c >= 'a' && c <= 'f')
1748 chr += 10 + c - 'a';
1749 else
1750 chr += 10 + c - 'A';
1751 }
1752 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001753 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001754 /* _decoding_error will have already written into the
1755 target buffer. */
1756 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001757 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001758 /* when we get here, chr is a 32-bit unicode character */
1759 if (chr <= 0xffff)
1760 /* UCS-2 character */
1761 *p++ = (Py_UNICODE) chr;
1762 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001763 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001764 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001765#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001766 *p++ = chr;
1767#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001768 chr -= 0x10000L;
1769 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001770 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001771#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001772 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773 endinpos = s-starts;
1774 outpos = p-PyUnicode_AS_UNICODE(v);
1775 if (unicode_decode_call_errorhandler(
1776 errors, &errorHandler,
1777 "unicodeescape", "illegal Unicode character",
1778 starts, size, &startinpos, &endinpos, &exc, &s,
1779 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001780 goto onError;
1781 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001782 break;
1783
1784 /* \N{name} */
1785 case 'N':
1786 message = "malformed \\N character escape";
1787 if (ucnhash_CAPI == NULL) {
1788 /* load the unicode data module */
1789 PyObject *m, *v;
1790 m = PyImport_ImportModule("unicodedata");
1791 if (m == NULL)
1792 goto ucnhashError;
1793 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1794 Py_DECREF(m);
1795 if (v == NULL)
1796 goto ucnhashError;
1797 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1798 Py_DECREF(v);
1799 if (ucnhash_CAPI == NULL)
1800 goto ucnhashError;
1801 }
1802 if (*s == '{') {
1803 const char *start = s+1;
1804 /* look for the closing brace */
1805 while (*s != '}' && s < end)
1806 s++;
1807 if (s > start && s < end && *s == '}') {
1808 /* found a name. look it up in the unicode database */
1809 message = "unknown Unicode character name";
1810 s++;
1811 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1812 goto store;
1813 }
1814 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 endinpos = s-starts;
1816 outpos = p-PyUnicode_AS_UNICODE(v);
1817 if (unicode_decode_call_errorhandler(
1818 errors, &errorHandler,
1819 "unicodeescape", message,
1820 starts, size, &startinpos, &endinpos, &exc, &s,
1821 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001823 break;
1824
1825 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001826 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 message = "\\ at end of string";
1828 s--;
1829 endinpos = s-starts;
1830 outpos = p-PyUnicode_AS_UNICODE(v);
1831 if (unicode_decode_call_errorhandler(
1832 errors, &errorHandler,
1833 "unicodeescape", message,
1834 starts, size, &startinpos, &endinpos, &exc, &s,
1835 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001836 goto onError;
1837 }
1838 else {
1839 *p++ = '\\';
1840 *p++ = (unsigned char)s[-1];
1841 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001842 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001843 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 nextByte:
1845 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001847 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001848 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001849 Py_XDECREF(errorHandler);
1850 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001852
Fredrik Lundhccc74732001-02-18 22:13:49 +00001853ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001854 PyErr_SetString(
1855 PyExc_UnicodeError,
1856 "\\N escapes not supported (can't load unicodedata module)"
1857 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 Py_XDECREF(errorHandler);
1859 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001860 return NULL;
1861
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001864 Py_XDECREF(errorHandler);
1865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 return NULL;
1867}
1868
1869/* Return a Unicode-Escape string version of the Unicode object.
1870
1871 If quotes is true, the string is enclosed in u"" or u'' quotes as
1872 appropriate.
1873
1874*/
1875
Barry Warsaw51ac5802000-03-20 16:36:48 +00001876static const Py_UNICODE *findchar(const Py_UNICODE *s,
1877 int size,
1878 Py_UNICODE ch);
1879
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880static
1881PyObject *unicodeescape_string(const Py_UNICODE *s,
1882 int size,
1883 int quotes)
1884{
1885 PyObject *repr;
1886 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001888 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889
1890 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1891 if (repr == NULL)
1892 return NULL;
1893
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001894 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895
1896 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001898 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 !findchar(s, size, '"')) ? '"' : '\'';
1900 }
1901 while (size-- > 0) {
1902 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001903
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001905 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001906 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907 *p++ = '\\';
1908 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001909 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001910 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001911
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001912#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001913 /* Map 21-bit characters to '\U00xxxxxx' */
1914 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001915 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001916
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001917 /* Resize the string if necessary */
1918 if (offset + 12 > PyString_GET_SIZE(repr)) {
1919 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001920 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001921 p = PyString_AS_STRING(repr) + offset;
1922 }
1923
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001924 *p++ = '\\';
1925 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001926 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1927 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1928 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1929 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1930 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1931 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1932 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001933 *p++ = hexdigit[ch & 0x0000000F];
1934 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001935 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001936#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001937 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1938 else if (ch >= 0xD800 && ch < 0xDC00) {
1939 Py_UNICODE ch2;
1940 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00001941
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001942 ch2 = *s++;
1943 size--;
1944 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1945 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1946 *p++ = '\\';
1947 *p++ = 'U';
1948 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1949 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1950 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1951 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1952 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1953 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1954 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1955 *p++ = hexdigit[ucs & 0x0000000F];
1956 continue;
1957 }
1958 /* Fall through: isolated surrogates are copied as-is */
1959 s--;
1960 size++;
1961 }
1962
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001964 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 *p++ = '\\';
1966 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001967 *p++ = hexdigit[(ch >> 12) & 0x000F];
1968 *p++ = hexdigit[(ch >> 8) & 0x000F];
1969 *p++ = hexdigit[(ch >> 4) & 0x000F];
1970 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001972
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001973 /* Map special whitespace to '\t', \n', '\r' */
1974 else if (ch == '\t') {
1975 *p++ = '\\';
1976 *p++ = 't';
1977 }
1978 else if (ch == '\n') {
1979 *p++ = '\\';
1980 *p++ = 'n';
1981 }
1982 else if (ch == '\r') {
1983 *p++ = '\\';
1984 *p++ = 'r';
1985 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001986
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001987 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001988 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001990 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001991 *p++ = hexdigit[(ch >> 4) & 0x000F];
1992 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00001993 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001994
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995 /* Copy everything else as-is */
1996 else
1997 *p++ = (char) ch;
1998 }
1999 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002000 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
2002 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002003 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002004 return repr;
2005}
2006
2007PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2008 int size)
2009{
2010 return unicodeescape_string(s, size, 0);
2011}
2012
2013PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2014{
2015 if (!PyUnicode_Check(unicode)) {
2016 PyErr_BadArgument();
2017 return NULL;
2018 }
2019 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2020 PyUnicode_GET_SIZE(unicode));
2021}
2022
2023/* --- Raw Unicode Escape Codec ------------------------------------------- */
2024
2025PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2026 int size,
2027 const char *errors)
2028{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002029 const char *starts = s;
2030 int startinpos;
2031 int endinpos;
2032 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002034 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 const char *end;
2036 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002037 PyObject *errorHandler = NULL;
2038 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002039
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 /* Escaped strings will always be longer than the resulting
2041 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002042 length after conversion to the true value. (But decoding error
2043 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 v = _PyUnicode_New(size);
2045 if (v == NULL)
2046 goto onError;
2047 if (size == 0)
2048 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 end = s + size;
2051 while (s < end) {
2052 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002053 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002055 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056
2057 /* Non-escape characters are interpreted as Unicode ordinals */
2058 if (*s != '\\') {
2059 *p++ = (unsigned char)*s++;
2060 continue;
2061 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063
2064 /* \u-escapes are only interpreted iff the number of leading
2065 backslashes if odd */
2066 bs = s;
2067 for (;s < end;) {
2068 if (*s != '\\')
2069 break;
2070 *p++ = (unsigned char)*s++;
2071 }
2072 if (((s - bs) & 1) == 0 ||
2073 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002074 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 continue;
2076 }
2077 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002078 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 s++;
2080
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002081 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002083 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002084 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002086 endinpos = s-starts;
2087 if (unicode_decode_call_errorhandler(
2088 errors, &errorHandler,
2089 "rawunicodeescape", "truncated \\uXXXX",
2090 starts, size, &startinpos, &endinpos, &exc, &s,
2091 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002093 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 }
2095 x = (x<<4) & ~0xF;
2096 if (c >= '0' && c <= '9')
2097 x += c - '0';
2098 else if (c >= 'a' && c <= 'f')
2099 x += 10 + c - 'a';
2100 else
2101 x += 10 + c - 'A';
2102 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002103#ifndef Py_UNICODE_WIDE
2104 if (x > 0x10000) {
2105 if (unicode_decode_call_errorhandler(
2106 errors, &errorHandler,
2107 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2108 starts, size, &startinpos, &endinpos, &exc, &s,
2109 (PyObject **)&v, &outpos, &p))
2110 goto onError;
2111 }
2112#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113 *p++ = x;
2114 nextByte:
2115 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002117 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002118 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 Py_XDECREF(errorHandler);
2120 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002122
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 onError:
2124 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 Py_XDECREF(errorHandler);
2126 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 return NULL;
2128}
2129
2130PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2131 int size)
2132{
2133 PyObject *repr;
2134 char *p;
2135 char *q;
2136
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002137 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002139#ifdef Py_UNICODE_WIDE
2140 repr = PyString_FromStringAndSize(NULL, 10 * size);
2141#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 if (repr == NULL)
2145 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002146 if (size == 0)
2147 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148
2149 p = q = PyString_AS_STRING(repr);
2150 while (size-- > 0) {
2151 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002152#ifdef Py_UNICODE_WIDE
2153 /* Map 32-bit characters to '\Uxxxxxxxx' */
2154 if (ch >= 0x10000) {
2155 *p++ = '\\';
2156 *p++ = 'U';
2157 *p++ = hexdigit[(ch >> 28) & 0xf];
2158 *p++ = hexdigit[(ch >> 24) & 0xf];
2159 *p++ = hexdigit[(ch >> 20) & 0xf];
2160 *p++ = hexdigit[(ch >> 16) & 0xf];
2161 *p++ = hexdigit[(ch >> 12) & 0xf];
2162 *p++ = hexdigit[(ch >> 8) & 0xf];
2163 *p++ = hexdigit[(ch >> 4) & 0xf];
2164 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002165 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002166 else
2167#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 /* Map 16-bit characters to '\uxxxx' */
2169 if (ch >= 256) {
2170 *p++ = '\\';
2171 *p++ = 'u';
2172 *p++ = hexdigit[(ch >> 12) & 0xf];
2173 *p++ = hexdigit[(ch >> 8) & 0xf];
2174 *p++ = hexdigit[(ch >> 4) & 0xf];
2175 *p++ = hexdigit[ch & 15];
2176 }
2177 /* Copy everything else as-is */
2178 else
2179 *p++ = (char) ch;
2180 }
2181 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002182 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 return repr;
2184}
2185
2186PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2187{
2188 if (!PyUnicode_Check(unicode)) {
2189 PyErr_BadArgument();
2190 return NULL;
2191 }
2192 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2193 PyUnicode_GET_SIZE(unicode));
2194}
2195
2196/* --- Latin-1 Codec ------------------------------------------------------ */
2197
2198PyObject *PyUnicode_DecodeLatin1(const char *s,
2199 int size,
2200 const char *errors)
2201{
2202 PyUnicodeObject *v;
2203 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002204
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002206 if (size == 1 && *(unsigned char*)s < 256) {
2207 Py_UNICODE r = *(unsigned char*)s;
2208 return PyUnicode_FromUnicode(&r, 1);
2209 }
2210
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 v = _PyUnicode_New(size);
2212 if (v == NULL)
2213 goto onError;
2214 if (size == 0)
2215 return (PyObject *)v;
2216 p = PyUnicode_AS_UNICODE(v);
2217 while (size-- > 0)
2218 *p++ = (unsigned char)*s++;
2219 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002220
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 onError:
2222 Py_XDECREF(v);
2223 return NULL;
2224}
2225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002226/* create or adjust a UnicodeEncodeError */
2227static void make_encode_exception(PyObject **exceptionObject,
2228 const char *encoding,
2229 const Py_UNICODE *unicode, int size,
2230 int startpos, int endpos,
2231 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002233 if (*exceptionObject == NULL) {
2234 *exceptionObject = PyUnicodeEncodeError_Create(
2235 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 }
2237 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002238 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2239 goto onError;
2240 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2241 goto onError;
2242 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2243 goto onError;
2244 return;
2245 onError:
2246 Py_DECREF(*exceptionObject);
2247 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 }
2249}
2250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002251/* raises a UnicodeEncodeError */
2252static void raise_encode_exception(PyObject **exceptionObject,
2253 const char *encoding,
2254 const Py_UNICODE *unicode, int size,
2255 int startpos, int endpos,
2256 const char *reason)
2257{
2258 make_encode_exception(exceptionObject,
2259 encoding, unicode, size, startpos, endpos, reason);
2260 if (*exceptionObject != NULL)
2261 PyCodec_StrictErrors(*exceptionObject);
2262}
2263
2264/* error handling callback helper:
2265 build arguments, call the callback and check the arguments,
2266 put the result into newpos and return the replacement string, which
2267 has to be freed by the caller */
2268static PyObject *unicode_encode_call_errorhandler(const char *errors,
2269 PyObject **errorHandler,
2270 const char *encoding, const char *reason,
2271 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2272 int startpos, int endpos,
2273 int *newpos)
2274{
2275 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2276
2277 PyObject *restuple;
2278 PyObject *resunicode;
2279
2280 if (*errorHandler == NULL) {
2281 *errorHandler = PyCodec_LookupError(errors);
2282 if (*errorHandler == NULL)
2283 return NULL;
2284 }
2285
2286 make_encode_exception(exceptionObject,
2287 encoding, unicode, size, startpos, endpos, reason);
2288 if (*exceptionObject == NULL)
2289 return NULL;
2290
2291 restuple = PyObject_CallFunctionObjArgs(
2292 *errorHandler, *exceptionObject, NULL);
2293 if (restuple == NULL)
2294 return NULL;
2295 if (!PyTuple_Check(restuple)) {
2296 PyErr_Format(PyExc_TypeError, &argparse[4]);
2297 Py_DECREF(restuple);
2298 return NULL;
2299 }
2300 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2301 &resunicode, newpos)) {
2302 Py_DECREF(restuple);
2303 return NULL;
2304 }
2305 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002306 *newpos = size+*newpos;
2307 if (*newpos<0 || *newpos>size) {
2308 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2309 Py_DECREF(restuple);
2310 return NULL;
2311 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312 Py_INCREF(resunicode);
2313 Py_DECREF(restuple);
2314 return resunicode;
2315}
2316
2317static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2318 int size,
2319 const char *errors,
2320 int limit)
2321{
2322 /* output object */
2323 PyObject *res;
2324 /* pointers to the beginning and end+1 of input */
2325 const Py_UNICODE *startp = p;
2326 const Py_UNICODE *endp = p + size;
2327 /* pointer to the beginning of the unencodable characters */
2328 /* const Py_UNICODE *badp = NULL; */
2329 /* pointer into the output */
2330 char *str;
2331 /* current output position */
2332 int respos = 0;
2333 int ressize;
2334 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2335 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2336 PyObject *errorHandler = NULL;
2337 PyObject *exc = NULL;
2338 /* the following variable is used for caching string comparisons
2339 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2340 int known_errorHandler = -1;
2341
2342 /* allocate enough for a simple encoding without
2343 replacements, if we need more, we'll resize */
2344 res = PyString_FromStringAndSize(NULL, size);
2345 if (res == NULL)
2346 goto onError;
2347 if (size == 0)
2348 return res;
2349 str = PyString_AS_STRING(res);
2350 ressize = size;
2351
2352 while (p<endp) {
2353 Py_UNICODE c = *p;
2354
2355 /* can we encode this? */
2356 if (c<limit) {
2357 /* no overflow check, because we know that the space is enough */
2358 *str++ = (char)c;
2359 ++p;
2360 }
2361 else {
2362 int unicodepos = p-startp;
2363 int requiredsize;
2364 PyObject *repunicode;
2365 int repsize;
2366 int newpos;
2367 int respos;
2368 Py_UNICODE *uni2;
2369 /* startpos for collecting unencodable chars */
2370 const Py_UNICODE *collstart = p;
2371 const Py_UNICODE *collend = p;
2372 /* find all unecodable characters */
2373 while ((collend < endp) && ((*collend)>=limit))
2374 ++collend;
2375 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2376 if (known_errorHandler==-1) {
2377 if ((errors==NULL) || (!strcmp(errors, "strict")))
2378 known_errorHandler = 1;
2379 else if (!strcmp(errors, "replace"))
2380 known_errorHandler = 2;
2381 else if (!strcmp(errors, "ignore"))
2382 known_errorHandler = 3;
2383 else if (!strcmp(errors, "xmlcharrefreplace"))
2384 known_errorHandler = 4;
2385 else
2386 known_errorHandler = 0;
2387 }
2388 switch (known_errorHandler) {
2389 case 1: /* strict */
2390 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2391 goto onError;
2392 case 2: /* replace */
2393 while (collstart++<collend)
2394 *str++ = '?'; /* fall through */
2395 case 3: /* ignore */
2396 p = collend;
2397 break;
2398 case 4: /* xmlcharrefreplace */
2399 respos = str-PyString_AS_STRING(res);
2400 /* determine replacement size (temporarily (mis)uses p) */
2401 for (p = collstart, repsize = 0; p < collend; ++p) {
2402 if (*p<10)
2403 repsize += 2+1+1;
2404 else if (*p<100)
2405 repsize += 2+2+1;
2406 else if (*p<1000)
2407 repsize += 2+3+1;
2408 else if (*p<10000)
2409 repsize += 2+4+1;
2410 else if (*p<100000)
2411 repsize += 2+5+1;
2412 else if (*p<1000000)
2413 repsize += 2+6+1;
2414 else
2415 repsize += 2+7+1;
2416 }
2417 requiredsize = respos+repsize+(endp-collend);
2418 if (requiredsize > ressize) {
2419 if (requiredsize<2*ressize)
2420 requiredsize = 2*ressize;
2421 if (_PyString_Resize(&res, requiredsize))
2422 goto onError;
2423 str = PyString_AS_STRING(res) + respos;
2424 ressize = requiredsize;
2425 }
2426 /* generate replacement (temporarily (mis)uses p) */
2427 for (p = collstart; p < collend; ++p) {
2428 str += sprintf(str, "&#%d;", (int)*p);
2429 }
2430 p = collend;
2431 break;
2432 default:
2433 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2434 encoding, reason, startp, size, &exc,
2435 collstart-startp, collend-startp, &newpos);
2436 if (repunicode == NULL)
2437 goto onError;
2438 /* need more space? (at least enough for what we
2439 have+the replacement+the rest of the string, so
2440 we won't have to check space for encodable characters) */
2441 respos = str-PyString_AS_STRING(res);
2442 repsize = PyUnicode_GET_SIZE(repunicode);
2443 requiredsize = respos+repsize+(endp-collend);
2444 if (requiredsize > ressize) {
2445 if (requiredsize<2*ressize)
2446 requiredsize = 2*ressize;
2447 if (_PyString_Resize(&res, requiredsize)) {
2448 Py_DECREF(repunicode);
2449 goto onError;
2450 }
2451 str = PyString_AS_STRING(res) + respos;
2452 ressize = requiredsize;
2453 }
2454 /* check if there is anything unencodable in the replacement
2455 and copy it to the output */
2456 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2457 c = *uni2;
2458 if (c >= limit) {
2459 raise_encode_exception(&exc, encoding, startp, size,
2460 unicodepos, unicodepos+1, reason);
2461 Py_DECREF(repunicode);
2462 goto onError;
2463 }
2464 *str = (char)c;
2465 }
2466 p = startp + newpos;
2467 Py_DECREF(repunicode);
2468 }
2469 }
2470 }
2471 /* Resize if we allocated to much */
2472 respos = str-PyString_AS_STRING(res);
2473 if (respos<ressize)
2474 /* If this falls res will be NULL */
2475 _PyString_Resize(&res, respos);
2476 Py_XDECREF(errorHandler);
2477 Py_XDECREF(exc);
2478 return res;
2479
2480 onError:
2481 Py_XDECREF(res);
2482 Py_XDECREF(errorHandler);
2483 Py_XDECREF(exc);
2484 return NULL;
2485}
2486
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2488 int size,
2489 const char *errors)
2490{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492}
2493
2494PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2495{
2496 if (!PyUnicode_Check(unicode)) {
2497 PyErr_BadArgument();
2498 return NULL;
2499 }
2500 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2501 PyUnicode_GET_SIZE(unicode),
2502 NULL);
2503}
2504
2505/* --- 7-bit ASCII Codec -------------------------------------------------- */
2506
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507PyObject *PyUnicode_DecodeASCII(const char *s,
2508 int size,
2509 const char *errors)
2510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 PyUnicodeObject *v;
2513 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002514 int startinpos;
2515 int endinpos;
2516 int outpos;
2517 const char *e;
2518 PyObject *errorHandler = NULL;
2519 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002520
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002522 if (size == 1 && *(unsigned char*)s < 128) {
2523 Py_UNICODE r = *(unsigned char*)s;
2524 return PyUnicode_FromUnicode(&r, 1);
2525 }
Tim Petersced69f82003-09-16 20:30:58 +00002526
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 v = _PyUnicode_New(size);
2528 if (v == NULL)
2529 goto onError;
2530 if (size == 0)
2531 return (PyObject *)v;
2532 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002533 e = s + size;
2534 while (s < e) {
2535 register unsigned char c = (unsigned char)*s;
2536 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 ++s;
2539 }
2540 else {
2541 startinpos = s-starts;
2542 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002543 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 if (unicode_decode_call_errorhandler(
2545 errors, &errorHandler,
2546 "ascii", "ordinal not in range(128)",
2547 starts, size, &startinpos, &endinpos, &exc, &s,
2548 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002552 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002553 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002554 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002555 Py_XDECREF(errorHandler);
2556 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002558
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 onError:
2560 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 Py_XDECREF(errorHandler);
2562 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 return NULL;
2564}
2565
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2567 int size,
2568 const char *errors)
2569{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002570 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571}
2572
2573PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2574{
2575 if (!PyUnicode_Check(unicode)) {
2576 PyErr_BadArgument();
2577 return NULL;
2578 }
2579 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2580 PyUnicode_GET_SIZE(unicode),
2581 NULL);
2582}
2583
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002584#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002585
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002586/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002587
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002588PyObject *PyUnicode_DecodeMBCS(const char *s,
2589 int size,
2590 const char *errors)
2591{
2592 PyUnicodeObject *v;
2593 Py_UNICODE *p;
2594
2595 /* First get the size of the result */
2596 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002597 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002598 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2599
2600 v = _PyUnicode_New(usize);
2601 if (v == NULL)
2602 return NULL;
2603 if (usize == 0)
2604 return (PyObject *)v;
2605 p = PyUnicode_AS_UNICODE(v);
2606 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2607 Py_DECREF(v);
2608 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2609 }
2610
2611 return (PyObject *)v;
2612}
2613
2614PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2615 int size,
2616 const char *errors)
2617{
2618 PyObject *repr;
2619 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002620 DWORD mbcssize;
2621
2622 /* If there are no characters, bail now! */
2623 if (size==0)
2624 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002625
2626 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002627 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002628 if (mbcssize==0)
2629 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2630
2631 repr = PyString_FromStringAndSize(NULL, mbcssize);
2632 if (repr == NULL)
2633 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002634 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002635 return repr;
2636
2637 /* Do the conversion */
2638 s = PyString_AS_STRING(repr);
2639 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2640 Py_DECREF(repr);
2641 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2642 }
2643 return repr;
2644}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002645
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002646PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2647{
2648 if (!PyUnicode_Check(unicode)) {
2649 PyErr_BadArgument();
2650 return NULL;
2651 }
2652 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2653 PyUnicode_GET_SIZE(unicode),
2654 NULL);
2655}
2656
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002657#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002658
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659/* --- Character Mapping Codec -------------------------------------------- */
2660
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661PyObject *PyUnicode_DecodeCharmap(const char *s,
2662 int size,
2663 PyObject *mapping,
2664 const char *errors)
2665{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002666 const char *starts = s;
2667 int startinpos;
2668 int endinpos;
2669 int outpos;
2670 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 PyUnicodeObject *v;
2672 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002673 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 PyObject *errorHandler = NULL;
2675 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002676
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 /* Default to Latin-1 */
2678 if (mapping == NULL)
2679 return PyUnicode_DecodeLatin1(s, size, errors);
2680
2681 v = _PyUnicode_New(size);
2682 if (v == NULL)
2683 goto onError;
2684 if (size == 0)
2685 return (PyObject *)v;
2686 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002687 e = s + size;
2688 while (s < e) {
2689 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 PyObject *w, *x;
2691
2692 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2693 w = PyInt_FromLong((long)ch);
2694 if (w == NULL)
2695 goto onError;
2696 x = PyObject_GetItem(mapping, w);
2697 Py_DECREF(w);
2698 if (x == NULL) {
2699 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002700 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002702 x = Py_None;
2703 Py_INCREF(x);
2704 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002705 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 }
2707
2708 /* Apply mapping */
2709 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002710 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711 if (value < 0 || value > 65535) {
2712 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002713 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 Py_DECREF(x);
2715 goto onError;
2716 }
2717 *p++ = (Py_UNICODE)value;
2718 }
2719 else if (x == Py_None) {
2720 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 outpos = p-PyUnicode_AS_UNICODE(v);
2722 startinpos = s-starts;
2723 endinpos = startinpos+1;
2724 if (unicode_decode_call_errorhandler(
2725 errors, &errorHandler,
2726 "charmap", "character maps to <undefined>",
2727 starts, size, &startinpos, &endinpos, &exc, &s,
2728 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 Py_DECREF(x);
2730 goto onError;
2731 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 }
2734 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002735 int targetsize = PyUnicode_GET_SIZE(x);
2736
2737 if (targetsize == 1)
2738 /* 1-1 mapping */
2739 *p++ = *PyUnicode_AS_UNICODE(x);
2740
2741 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002743 if (targetsize > extrachars) {
2744 /* resize first */
2745 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2746 int needed = (targetsize - extrachars) + \
2747 (targetsize << 2);
2748 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002749 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002750 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002751 Py_DECREF(x);
2752 goto onError;
2753 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002754 p = PyUnicode_AS_UNICODE(v) + oldpos;
2755 }
2756 Py_UNICODE_COPY(p,
2757 PyUnicode_AS_UNICODE(x),
2758 targetsize);
2759 p += targetsize;
2760 extrachars -= targetsize;
2761 }
2762 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 }
2764 else {
2765 /* wrong return value */
2766 PyErr_SetString(PyExc_TypeError,
2767 "character mapping must return integer, None or unicode");
2768 Py_DECREF(x);
2769 goto onError;
2770 }
2771 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 }
2774 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002775 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002777 Py_XDECREF(errorHandler);
2778 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002780
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 Py_XDECREF(errorHandler);
2783 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 Py_XDECREF(v);
2785 return NULL;
2786}
2787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788/* Lookup the character ch in the mapping. If the character
2789 can't be found, Py_None is returned (or NULL, if another
2790 error occured). */
2791static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 PyObject *w = PyInt_FromLong((long)c);
2794 PyObject *x;
2795
2796 if (w == NULL)
2797 return NULL;
2798 x = PyObject_GetItem(mapping, w);
2799 Py_DECREF(w);
2800 if (x == NULL) {
2801 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2802 /* No mapping found means: mapping is undefined. */
2803 PyErr_Clear();
2804 x = Py_None;
2805 Py_INCREF(x);
2806 return x;
2807 } else
2808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002810 else if (x == Py_None)
2811 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002812 else if (PyInt_Check(x)) {
2813 long value = PyInt_AS_LONG(x);
2814 if (value < 0 || value > 255) {
2815 PyErr_SetString(PyExc_TypeError,
2816 "character mapping must be in range(256)");
2817 Py_DECREF(x);
2818 return NULL;
2819 }
2820 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 else if (PyString_Check(x))
2823 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002825 /* wrong return value */
2826 PyErr_SetString(PyExc_TypeError,
2827 "character mapping must return integer, None or str");
2828 Py_DECREF(x);
2829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 }
2831}
2832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833/* lookup the character, put the result in the output string and adjust
2834 various state variables. Reallocate the output string if not enough
2835 space is available. Return a new reference to the object that
2836 was put in the output buffer, or Py_None, if the mapping was undefined
2837 (in which case no character was written) or NULL, if a
2838 reallocation error ocurred. The called must decref the result */
2839static
2840PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2841 PyObject **outobj, int *outpos)
2842{
2843 PyObject *rep = charmapencode_lookup(c, mapping);
2844
2845 if (rep==NULL)
2846 return NULL;
2847 else if (rep==Py_None)
2848 return rep;
2849 else {
2850 char *outstart = PyString_AS_STRING(*outobj);
2851 int outsize = PyString_GET_SIZE(*outobj);
2852 if (PyInt_Check(rep)) {
2853 int requiredsize = *outpos+1;
2854 if (outsize<requiredsize) {
2855 /* exponentially overallocate to minimize reallocations */
2856 if (requiredsize < 2*outsize)
2857 requiredsize = 2*outsize;
2858 if (_PyString_Resize(outobj, requiredsize)) {
2859 Py_DECREF(rep);
2860 return NULL;
2861 }
2862 outstart = PyString_AS_STRING(*outobj);
2863 }
2864 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2865 }
2866 else {
2867 const char *repchars = PyString_AS_STRING(rep);
2868 int repsize = PyString_GET_SIZE(rep);
2869 int requiredsize = *outpos+repsize;
2870 if (outsize<requiredsize) {
2871 /* exponentially overallocate to minimize reallocations */
2872 if (requiredsize < 2*outsize)
2873 requiredsize = 2*outsize;
2874 if (_PyString_Resize(outobj, requiredsize)) {
2875 Py_DECREF(rep);
2876 return NULL;
2877 }
2878 outstart = PyString_AS_STRING(*outobj);
2879 }
2880 memcpy(outstart + *outpos, repchars, repsize);
2881 *outpos += repsize;
2882 }
2883 }
2884 return rep;
2885}
2886
2887/* handle an error in PyUnicode_EncodeCharmap
2888 Return 0 on success, -1 on error */
2889static
2890int charmap_encoding_error(
2891 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2892 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002893 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002894 PyObject **res, int *respos)
2895{
2896 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2897 int repsize;
2898 int newpos;
2899 Py_UNICODE *uni2;
2900 /* startpos for collecting unencodable chars */
2901 int collstartpos = *inpos;
2902 int collendpos = *inpos+1;
2903 int collpos;
2904 char *encoding = "charmap";
2905 char *reason = "character maps to <undefined>";
2906
2907 PyObject *x;
2908 /* find all unencodable characters */
2909 while (collendpos < size) {
2910 x = charmapencode_lookup(p[collendpos], mapping);
2911 if (x==NULL)
2912 return -1;
2913 else if (x!=Py_None) {
2914 Py_DECREF(x);
2915 break;
2916 }
2917 Py_DECREF(x);
2918 ++collendpos;
2919 }
2920 /* cache callback name lookup
2921 * (if not done yet, i.e. it's the first error) */
2922 if (*known_errorHandler==-1) {
2923 if ((errors==NULL) || (!strcmp(errors, "strict")))
2924 *known_errorHandler = 1;
2925 else if (!strcmp(errors, "replace"))
2926 *known_errorHandler = 2;
2927 else if (!strcmp(errors, "ignore"))
2928 *known_errorHandler = 3;
2929 else if (!strcmp(errors, "xmlcharrefreplace"))
2930 *known_errorHandler = 4;
2931 else
2932 *known_errorHandler = 0;
2933 }
2934 switch (*known_errorHandler) {
2935 case 1: /* strict */
2936 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2937 return -1;
2938 case 2: /* replace */
2939 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2940 x = charmapencode_output('?', mapping, res, respos);
2941 if (x==NULL) {
2942 return -1;
2943 }
2944 else if (x==Py_None) {
2945 Py_DECREF(x);
2946 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2947 return -1;
2948 }
2949 Py_DECREF(x);
2950 }
2951 /* fall through */
2952 case 3: /* ignore */
2953 *inpos = collendpos;
2954 break;
2955 case 4: /* xmlcharrefreplace */
2956 /* generate replacement (temporarily (mis)uses p) */
2957 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2958 char buffer[2+29+1+1];
2959 char *cp;
2960 sprintf(buffer, "&#%d;", (int)p[collpos]);
2961 for (cp = buffer; *cp; ++cp) {
2962 x = charmapencode_output(*cp, mapping, res, respos);
2963 if (x==NULL)
2964 return -1;
2965 else if (x==Py_None) {
2966 Py_DECREF(x);
2967 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2968 return -1;
2969 }
2970 Py_DECREF(x);
2971 }
2972 }
2973 *inpos = collendpos;
2974 break;
2975 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002976 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002977 encoding, reason, p, size, exceptionObject,
2978 collstartpos, collendpos, &newpos);
2979 if (repunicode == NULL)
2980 return -1;
2981 /* generate replacement */
2982 repsize = PyUnicode_GET_SIZE(repunicode);
2983 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2984 x = charmapencode_output(*uni2, mapping, res, respos);
2985 if (x==NULL) {
2986 Py_DECREF(repunicode);
2987 return -1;
2988 }
2989 else if (x==Py_None) {
2990 Py_DECREF(repunicode);
2991 Py_DECREF(x);
2992 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2993 return -1;
2994 }
2995 Py_DECREF(x);
2996 }
2997 *inpos = newpos;
2998 Py_DECREF(repunicode);
2999 }
3000 return 0;
3001}
3002
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3004 int size,
3005 PyObject *mapping,
3006 const char *errors)
3007{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003008 /* output object */
3009 PyObject *res = NULL;
3010 /* current input position */
3011 int inpos = 0;
3012 /* current output position */
3013 int respos = 0;
3014 PyObject *errorHandler = NULL;
3015 PyObject *exc = NULL;
3016 /* the following variable is used for caching string comparisons
3017 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3018 * 3=ignore, 4=xmlcharrefreplace */
3019 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020
3021 /* Default to Latin-1 */
3022 if (mapping == NULL)
3023 return PyUnicode_EncodeLatin1(p, size, errors);
3024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 /* allocate enough for a simple encoding without
3026 replacements, if we need more, we'll resize */
3027 res = PyString_FromStringAndSize(NULL, size);
3028 if (res == NULL)
3029 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003030 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 while (inpos<size) {
3034 /* try to encode it */
3035 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3036 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 if (x==Py_None) { /* unencodable character */
3039 if (charmap_encoding_error(p, size, &inpos, mapping,
3040 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003041 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003042 &res, &respos)) {
3043 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003044 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003047 else
3048 /* done with this character => adjust input position */
3049 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 Py_DECREF(x);
3051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 /* Resize if we allocated to much */
3054 if (respos<PyString_GET_SIZE(res)) {
3055 if (_PyString_Resize(&res, respos))
3056 goto onError;
3057 }
3058 Py_XDECREF(exc);
3059 Py_XDECREF(errorHandler);
3060 return res;
3061
3062 onError:
3063 Py_XDECREF(res);
3064 Py_XDECREF(exc);
3065 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066 return NULL;
3067}
3068
3069PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3070 PyObject *mapping)
3071{
3072 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3073 PyErr_BadArgument();
3074 return NULL;
3075 }
3076 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3077 PyUnicode_GET_SIZE(unicode),
3078 mapping,
3079 NULL);
3080}
3081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003082/* create or adjust a UnicodeTranslateError */
3083static void make_translate_exception(PyObject **exceptionObject,
3084 const Py_UNICODE *unicode, int size,
3085 int startpos, int endpos,
3086 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 if (*exceptionObject == NULL) {
3089 *exceptionObject = PyUnicodeTranslateError_Create(
3090 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
3092 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003093 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3094 goto onError;
3095 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3096 goto onError;
3097 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3098 goto onError;
3099 return;
3100 onError:
3101 Py_DECREF(*exceptionObject);
3102 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 }
3104}
3105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003106/* raises a UnicodeTranslateError */
3107static void raise_translate_exception(PyObject **exceptionObject,
3108 const Py_UNICODE *unicode, int size,
3109 int startpos, int endpos,
3110 const char *reason)
3111{
3112 make_translate_exception(exceptionObject,
3113 unicode, size, startpos, endpos, reason);
3114 if (*exceptionObject != NULL)
3115 PyCodec_StrictErrors(*exceptionObject);
3116}
3117
3118/* error handling callback helper:
3119 build arguments, call the callback and check the arguments,
3120 put the result into newpos and return the replacement string, which
3121 has to be freed by the caller */
3122static PyObject *unicode_translate_call_errorhandler(const char *errors,
3123 PyObject **errorHandler,
3124 const char *reason,
3125 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3126 int startpos, int endpos,
3127 int *newpos)
3128{
3129 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3130
3131 PyObject *restuple;
3132 PyObject *resunicode;
3133
3134 if (*errorHandler == NULL) {
3135 *errorHandler = PyCodec_LookupError(errors);
3136 if (*errorHandler == NULL)
3137 return NULL;
3138 }
3139
3140 make_translate_exception(exceptionObject,
3141 unicode, size, startpos, endpos, reason);
3142 if (*exceptionObject == NULL)
3143 return NULL;
3144
3145 restuple = PyObject_CallFunctionObjArgs(
3146 *errorHandler, *exceptionObject, NULL);
3147 if (restuple == NULL)
3148 return NULL;
3149 if (!PyTuple_Check(restuple)) {
3150 PyErr_Format(PyExc_TypeError, &argparse[4]);
3151 Py_DECREF(restuple);
3152 return NULL;
3153 }
3154 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3155 &resunicode, newpos)) {
3156 Py_DECREF(restuple);
3157 return NULL;
3158 }
3159 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003160 *newpos = size+*newpos;
3161 if (*newpos<0 || *newpos>size) {
3162 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3163 Py_DECREF(restuple);
3164 return NULL;
3165 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003166 Py_INCREF(resunicode);
3167 Py_DECREF(restuple);
3168 return resunicode;
3169}
3170
3171/* Lookup the character ch in the mapping and put the result in result,
3172 which must be decrefed by the caller.
3173 Return 0 on success, -1 on error */
3174static
3175int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3176{
3177 PyObject *w = PyInt_FromLong((long)c);
3178 PyObject *x;
3179
3180 if (w == NULL)
3181 return -1;
3182 x = PyObject_GetItem(mapping, w);
3183 Py_DECREF(w);
3184 if (x == NULL) {
3185 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3186 /* No mapping found means: use 1:1 mapping. */
3187 PyErr_Clear();
3188 *result = NULL;
3189 return 0;
3190 } else
3191 return -1;
3192 }
3193 else if (x == Py_None) {
3194 *result = x;
3195 return 0;
3196 }
3197 else if (PyInt_Check(x)) {
3198 long value = PyInt_AS_LONG(x);
3199 long max = PyUnicode_GetMax();
3200 if (value < 0 || value > max) {
3201 PyErr_Format(PyExc_TypeError,
3202 "character mapping must be in range(0x%lx)", max+1);
3203 Py_DECREF(x);
3204 return -1;
3205 }
3206 *result = x;
3207 return 0;
3208 }
3209 else if (PyUnicode_Check(x)) {
3210 *result = x;
3211 return 0;
3212 }
3213 else {
3214 /* wrong return value */
3215 PyErr_SetString(PyExc_TypeError,
3216 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003217 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 return -1;
3219 }
3220}
3221/* ensure that *outobj is at least requiredsize characters long,
3222if not reallocate and adjust various state variables.
3223Return 0 on success, -1 on error */
3224static
Walter Dörwald4894c302003-10-24 14:25:28 +00003225int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 int requiredsize)
3227{
Walter Dörwald4894c302003-10-24 14:25:28 +00003228 int oldsize = PyUnicode_GET_SIZE(*outobj);
3229 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003230 /* remember old output position */
3231 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3232 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003233 if (requiredsize < 2 * oldsize)
3234 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003235 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 return -1;
3237 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238 }
3239 return 0;
3240}
3241/* lookup the character, put the result in the output string and adjust
3242 various state variables. Return a new reference to the object that
3243 was put in the output buffer in *result, or Py_None, if the mapping was
3244 undefined (in which case no character was written).
3245 The called must decref result.
3246 Return 0 on success, -1 on error. */
3247static
Walter Dörwald4894c302003-10-24 14:25:28 +00003248int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3249 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3250 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003251{
Walter Dörwald4894c302003-10-24 14:25:28 +00003252 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003253 return -1;
3254 if (*res==NULL) {
3255 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003256 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003257 }
3258 else if (*res==Py_None)
3259 ;
3260 else if (PyInt_Check(*res)) {
3261 /* no overflow check, because we know that the space is enough */
3262 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3263 }
3264 else if (PyUnicode_Check(*res)) {
3265 int repsize = PyUnicode_GET_SIZE(*res);
3266 if (repsize==1) {
3267 /* no overflow check, because we know that the space is enough */
3268 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3269 }
3270 else if (repsize!=0) {
3271 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003272 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
3273 (insize - (*curinp-*startinp)) +
3274 repsize - 1;
3275 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 return -1;
3277 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3278 *outp += repsize;
3279 }
3280 }
3281 else
3282 return -1;
3283 return 0;
3284}
3285
3286PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 int size,
3288 PyObject *mapping,
3289 const char *errors)
3290{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291 /* output object */
3292 PyObject *res = NULL;
3293 /* pointers to the beginning and end+1 of input */
3294 const Py_UNICODE *startp = p;
3295 const Py_UNICODE *endp = p + size;
3296 /* pointer into the output */
3297 Py_UNICODE *str;
3298 /* current output position */
3299 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 char *reason = "character maps to <undefined>";
3301 PyObject *errorHandler = NULL;
3302 PyObject *exc = NULL;
3303 /* the following variable is used for caching string comparisons
3304 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3305 * 3=ignore, 4=xmlcharrefreplace */
3306 int known_errorHandler = -1;
3307
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308 if (mapping == NULL) {
3309 PyErr_BadArgument();
3310 return NULL;
3311 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003312
3313 /* allocate enough for a simple 1:1 translation without
3314 replacements, if we need more, we'll resize */
3315 res = PyUnicode_FromUnicode(NULL, size);
3316 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003317 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 return res;
3320 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322 while (p<endp) {
3323 /* try to encode it */
3324 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003325 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003326 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 goto onError;
3328 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003329 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 if (x!=Py_None) /* it worked => adjust input pointer */
3331 ++p;
3332 else { /* untranslatable character */
3333 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3334 int repsize;
3335 int newpos;
3336 Py_UNICODE *uni2;
3337 /* startpos for collecting untranslatable chars */
3338 const Py_UNICODE *collstart = p;
3339 const Py_UNICODE *collend = p+1;
3340 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 /* find all untranslatable characters */
3343 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003344 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345 goto onError;
3346 Py_XDECREF(x);
3347 if (x!=Py_None)
3348 break;
3349 ++collend;
3350 }
3351 /* cache callback name lookup
3352 * (if not done yet, i.e. it's the first error) */
3353 if (known_errorHandler==-1) {
3354 if ((errors==NULL) || (!strcmp(errors, "strict")))
3355 known_errorHandler = 1;
3356 else if (!strcmp(errors, "replace"))
3357 known_errorHandler = 2;
3358 else if (!strcmp(errors, "ignore"))
3359 known_errorHandler = 3;
3360 else if (!strcmp(errors, "xmlcharrefreplace"))
3361 known_errorHandler = 4;
3362 else
3363 known_errorHandler = 0;
3364 }
3365 switch (known_errorHandler) {
3366 case 1: /* strict */
3367 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3368 goto onError;
3369 case 2: /* replace */
3370 /* No need to check for space, this is a 1:1 replacement */
3371 for (coll = collstart; coll<collend; ++coll)
3372 *str++ = '?';
3373 /* fall through */
3374 case 3: /* ignore */
3375 p = collend;
3376 break;
3377 case 4: /* xmlcharrefreplace */
3378 /* generate replacement (temporarily (mis)uses p) */
3379 for (p = collstart; p < collend; ++p) {
3380 char buffer[2+29+1+1];
3381 char *cp;
3382 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003383 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3385 goto onError;
3386 for (cp = buffer; *cp; ++cp)
3387 *str++ = *cp;
3388 }
3389 p = collend;
3390 break;
3391 default:
3392 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3393 reason, startp, size, &exc,
3394 collstart-startp, collend-startp, &newpos);
3395 if (repunicode == NULL)
3396 goto onError;
3397 /* generate replacement */
3398 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003399 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3401 Py_DECREF(repunicode);
3402 goto onError;
3403 }
3404 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3405 *str++ = *uni2;
3406 p = startp + newpos;
3407 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408 }
3409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 /* Resize if we allocated to much */
3412 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003413 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003414 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003415 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416 }
3417 Py_XDECREF(exc);
3418 Py_XDECREF(errorHandler);
3419 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421 onError:
3422 Py_XDECREF(res);
3423 Py_XDECREF(exc);
3424 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 return NULL;
3426}
3427
3428PyObject *PyUnicode_Translate(PyObject *str,
3429 PyObject *mapping,
3430 const char *errors)
3431{
3432 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003433
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434 str = PyUnicode_FromObject(str);
3435 if (str == NULL)
3436 goto onError;
3437 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3438 PyUnicode_GET_SIZE(str),
3439 mapping,
3440 errors);
3441 Py_DECREF(str);
3442 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003443
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444 onError:
3445 Py_XDECREF(str);
3446 return NULL;
3447}
Tim Petersced69f82003-09-16 20:30:58 +00003448
Guido van Rossum9e896b32000-04-05 20:11:21 +00003449/* --- Decimal Encoder ---------------------------------------------------- */
3450
3451int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3452 int length,
3453 char *output,
3454 const char *errors)
3455{
3456 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 PyObject *errorHandler = NULL;
3458 PyObject *exc = NULL;
3459 const char *encoding = "decimal";
3460 const char *reason = "invalid decimal Unicode string";
3461 /* the following variable is used for caching string comparisons
3462 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3463 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003464
3465 if (output == NULL) {
3466 PyErr_BadArgument();
3467 return -1;
3468 }
3469
3470 p = s;
3471 end = s + length;
3472 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003474 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475 PyObject *repunicode;
3476 int repsize;
3477 int newpos;
3478 Py_UNICODE *uni2;
3479 Py_UNICODE *collstart;
3480 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003481
Guido van Rossum9e896b32000-04-05 20:11:21 +00003482 if (Py_UNICODE_ISSPACE(ch)) {
3483 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003485 continue;
3486 }
3487 decimal = Py_UNICODE_TODECIMAL(ch);
3488 if (decimal >= 0) {
3489 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003490 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003491 continue;
3492 }
Guido van Rossumba477042000-04-06 18:18:10 +00003493 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003494 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003496 continue;
3497 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 /* All other characters are considered unencodable */
3499 collstart = p;
3500 collend = p+1;
3501 while (collend < end) {
3502 if ((0 < *collend && *collend < 256) ||
3503 !Py_UNICODE_ISSPACE(*collend) ||
3504 Py_UNICODE_TODECIMAL(*collend))
3505 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003506 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 /* cache callback name lookup
3508 * (if not done yet, i.e. it's the first error) */
3509 if (known_errorHandler==-1) {
3510 if ((errors==NULL) || (!strcmp(errors, "strict")))
3511 known_errorHandler = 1;
3512 else if (!strcmp(errors, "replace"))
3513 known_errorHandler = 2;
3514 else if (!strcmp(errors, "ignore"))
3515 known_errorHandler = 3;
3516 else if (!strcmp(errors, "xmlcharrefreplace"))
3517 known_errorHandler = 4;
3518 else
3519 known_errorHandler = 0;
3520 }
3521 switch (known_errorHandler) {
3522 case 1: /* strict */
3523 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3524 goto onError;
3525 case 2: /* replace */
3526 for (p = collstart; p < collend; ++p)
3527 *output++ = '?';
3528 /* fall through */
3529 case 3: /* ignore */
3530 p = collend;
3531 break;
3532 case 4: /* xmlcharrefreplace */
3533 /* generate replacement (temporarily (mis)uses p) */
3534 for (p = collstart; p < collend; ++p)
3535 output += sprintf(output, "&#%d;", (int)*p);
3536 p = collend;
3537 break;
3538 default:
3539 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3540 encoding, reason, s, length, &exc,
3541 collstart-s, collend-s, &newpos);
3542 if (repunicode == NULL)
3543 goto onError;
3544 /* generate replacement */
3545 repsize = PyUnicode_GET_SIZE(repunicode);
3546 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3547 Py_UNICODE ch = *uni2;
3548 if (Py_UNICODE_ISSPACE(ch))
3549 *output++ = ' ';
3550 else {
3551 decimal = Py_UNICODE_TODECIMAL(ch);
3552 if (decimal >= 0)
3553 *output++ = '0' + decimal;
3554 else if (0 < ch && ch < 256)
3555 *output++ = (char)ch;
3556 else {
3557 Py_DECREF(repunicode);
3558 raise_encode_exception(&exc, encoding,
3559 s, length, collstart-s, collend-s, reason);
3560 goto onError;
3561 }
3562 }
3563 }
3564 p = s + newpos;
3565 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003566 }
3567 }
3568 /* 0-terminate the output string */
3569 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 Py_XDECREF(exc);
3571 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003572 return 0;
3573
3574 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 Py_XDECREF(exc);
3576 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003577 return -1;
3578}
3579
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580/* --- Helpers ------------------------------------------------------------ */
3581
Tim Petersced69f82003-09-16 20:30:58 +00003582static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583int count(PyUnicodeObject *self,
3584 int start,
3585 int end,
3586 PyUnicodeObject *substring)
3587{
3588 int count = 0;
3589
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003590 if (start < 0)
3591 start += self->length;
3592 if (start < 0)
3593 start = 0;
3594 if (end > self->length)
3595 end = self->length;
3596 if (end < 0)
3597 end += self->length;
3598 if (end < 0)
3599 end = 0;
3600
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003601 if (substring->length == 0)
3602 return (end - start + 1);
3603
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 end -= substring->length;
3605
3606 while (start <= end)
3607 if (Py_UNICODE_MATCH(self, start, substring)) {
3608 count++;
3609 start += substring->length;
3610 } else
3611 start++;
3612
3613 return count;
3614}
3615
3616int PyUnicode_Count(PyObject *str,
3617 PyObject *substr,
3618 int start,
3619 int end)
3620{
3621 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003622
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 str = PyUnicode_FromObject(str);
3624 if (str == NULL)
3625 return -1;
3626 substr = PyUnicode_FromObject(substr);
3627 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003628 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 return -1;
3630 }
Tim Petersced69f82003-09-16 20:30:58 +00003631
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632 result = count((PyUnicodeObject *)str,
3633 start, end,
3634 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003635
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636 Py_DECREF(str);
3637 Py_DECREF(substr);
3638 return result;
3639}
3640
Tim Petersced69f82003-09-16 20:30:58 +00003641static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642int findstring(PyUnicodeObject *self,
3643 PyUnicodeObject *substring,
3644 int start,
3645 int end,
3646 int direction)
3647{
3648 if (start < 0)
3649 start += self->length;
3650 if (start < 0)
3651 start = 0;
3652
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 if (end > self->length)
3654 end = self->length;
3655 if (end < 0)
3656 end += self->length;
3657 if (end < 0)
3658 end = 0;
3659
Guido van Rossum76afbd92002-08-20 17:29:29 +00003660 if (substring->length == 0)
3661 return (direction > 0) ? start : end;
3662
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 end -= substring->length;
3664
3665 if (direction < 0) {
3666 for (; end >= start; end--)
3667 if (Py_UNICODE_MATCH(self, end, substring))
3668 return end;
3669 } else {
3670 for (; start <= end; start++)
3671 if (Py_UNICODE_MATCH(self, start, substring))
3672 return start;
3673 }
3674
3675 return -1;
3676}
3677
3678int PyUnicode_Find(PyObject *str,
3679 PyObject *substr,
3680 int start,
3681 int end,
3682 int direction)
3683{
3684 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003685
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 str = PyUnicode_FromObject(str);
3687 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003688 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 substr = PyUnicode_FromObject(substr);
3690 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003691 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003692 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 }
Tim Petersced69f82003-09-16 20:30:58 +00003694
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 result = findstring((PyUnicodeObject *)str,
3696 (PyUnicodeObject *)substr,
3697 start, end, direction);
3698 Py_DECREF(str);
3699 Py_DECREF(substr);
3700 return result;
3701}
3702
Tim Petersced69f82003-09-16 20:30:58 +00003703static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704int tailmatch(PyUnicodeObject *self,
3705 PyUnicodeObject *substring,
3706 int start,
3707 int end,
3708 int direction)
3709{
3710 if (start < 0)
3711 start += self->length;
3712 if (start < 0)
3713 start = 0;
3714
3715 if (substring->length == 0)
3716 return 1;
3717
3718 if (end > self->length)
3719 end = self->length;
3720 if (end < 0)
3721 end += self->length;
3722 if (end < 0)
3723 end = 0;
3724
3725 end -= substring->length;
3726 if (end < start)
3727 return 0;
3728
3729 if (direction > 0) {
3730 if (Py_UNICODE_MATCH(self, end, substring))
3731 return 1;
3732 } else {
3733 if (Py_UNICODE_MATCH(self, start, substring))
3734 return 1;
3735 }
3736
3737 return 0;
3738}
3739
3740int PyUnicode_Tailmatch(PyObject *str,
3741 PyObject *substr,
3742 int start,
3743 int end,
3744 int direction)
3745{
3746 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003747
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748 str = PyUnicode_FromObject(str);
3749 if (str == NULL)
3750 return -1;
3751 substr = PyUnicode_FromObject(substr);
3752 if (substr == NULL) {
3753 Py_DECREF(substr);
3754 return -1;
3755 }
Tim Petersced69f82003-09-16 20:30:58 +00003756
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 result = tailmatch((PyUnicodeObject *)str,
3758 (PyUnicodeObject *)substr,
3759 start, end, direction);
3760 Py_DECREF(str);
3761 Py_DECREF(substr);
3762 return result;
3763}
3764
Tim Petersced69f82003-09-16 20:30:58 +00003765static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766const Py_UNICODE *findchar(const Py_UNICODE *s,
3767 int size,
3768 Py_UNICODE ch)
3769{
3770 /* like wcschr, but doesn't stop at NULL characters */
3771
3772 while (size-- > 0) {
3773 if (*s == ch)
3774 return s;
3775 s++;
3776 }
3777
3778 return NULL;
3779}
3780
3781/* Apply fixfct filter to the Unicode object self and return a
3782 reference to the modified object */
3783
Tim Petersced69f82003-09-16 20:30:58 +00003784static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785PyObject *fixup(PyUnicodeObject *self,
3786 int (*fixfct)(PyUnicodeObject *s))
3787{
3788
3789 PyUnicodeObject *u;
3790
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003791 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792 if (u == NULL)
3793 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003794
3795 Py_UNICODE_COPY(u->str, self->str, self->length);
3796
Tim Peters7a29bd52001-09-12 03:03:31 +00003797 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798 /* fixfct should return TRUE if it modified the buffer. If
3799 FALSE, return a reference to the original buffer instead
3800 (to save space, not time) */
3801 Py_INCREF(self);
3802 Py_DECREF(u);
3803 return (PyObject*) self;
3804 }
3805 return (PyObject*) u;
3806}
3807
Tim Petersced69f82003-09-16 20:30:58 +00003808static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809int fixupper(PyUnicodeObject *self)
3810{
3811 int len = self->length;
3812 Py_UNICODE *s = self->str;
3813 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003814
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 while (len-- > 0) {
3816 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003817
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 ch = Py_UNICODE_TOUPPER(*s);
3819 if (ch != *s) {
3820 status = 1;
3821 *s = ch;
3822 }
3823 s++;
3824 }
3825
3826 return status;
3827}
3828
Tim Petersced69f82003-09-16 20:30:58 +00003829static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830int fixlower(PyUnicodeObject *self)
3831{
3832 int len = self->length;
3833 Py_UNICODE *s = self->str;
3834 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003835
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 while (len-- > 0) {
3837 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003838
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 ch = Py_UNICODE_TOLOWER(*s);
3840 if (ch != *s) {
3841 status = 1;
3842 *s = ch;
3843 }
3844 s++;
3845 }
3846
3847 return status;
3848}
3849
Tim Petersced69f82003-09-16 20:30:58 +00003850static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851int fixswapcase(PyUnicodeObject *self)
3852{
3853 int len = self->length;
3854 Py_UNICODE *s = self->str;
3855 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003856
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857 while (len-- > 0) {
3858 if (Py_UNICODE_ISUPPER(*s)) {
3859 *s = Py_UNICODE_TOLOWER(*s);
3860 status = 1;
3861 } else if (Py_UNICODE_ISLOWER(*s)) {
3862 *s = Py_UNICODE_TOUPPER(*s);
3863 status = 1;
3864 }
3865 s++;
3866 }
3867
3868 return status;
3869}
3870
Tim Petersced69f82003-09-16 20:30:58 +00003871static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872int fixcapitalize(PyUnicodeObject *self)
3873{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003874 int len = self->length;
3875 Py_UNICODE *s = self->str;
3876 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003877
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003878 if (len == 0)
3879 return 0;
3880 if (Py_UNICODE_ISLOWER(*s)) {
3881 *s = Py_UNICODE_TOUPPER(*s);
3882 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003884 s++;
3885 while (--len > 0) {
3886 if (Py_UNICODE_ISUPPER(*s)) {
3887 *s = Py_UNICODE_TOLOWER(*s);
3888 status = 1;
3889 }
3890 s++;
3891 }
3892 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893}
3894
3895static
3896int fixtitle(PyUnicodeObject *self)
3897{
3898 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3899 register Py_UNICODE *e;
3900 int previous_is_cased;
3901
3902 /* Shortcut for single character strings */
3903 if (PyUnicode_GET_SIZE(self) == 1) {
3904 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3905 if (*p != ch) {
3906 *p = ch;
3907 return 1;
3908 }
3909 else
3910 return 0;
3911 }
Tim Petersced69f82003-09-16 20:30:58 +00003912
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 e = p + PyUnicode_GET_SIZE(self);
3914 previous_is_cased = 0;
3915 for (; p < e; p++) {
3916 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003917
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918 if (previous_is_cased)
3919 *p = Py_UNICODE_TOLOWER(ch);
3920 else
3921 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003922
3923 if (Py_UNICODE_ISLOWER(ch) ||
3924 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 Py_UNICODE_ISTITLE(ch))
3926 previous_is_cased = 1;
3927 else
3928 previous_is_cased = 0;
3929 }
3930 return 1;
3931}
3932
3933PyObject *PyUnicode_Join(PyObject *separator,
3934 PyObject *seq)
3935{
3936 Py_UNICODE *sep;
3937 int seplen;
3938 PyUnicodeObject *res = NULL;
3939 int reslen = 0;
3940 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 int sz = 100;
3942 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003943 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944
Tim Peters2cfe3682001-05-05 05:36:48 +00003945 it = PyObject_GetIter(seq);
3946 if (it == NULL)
3947 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948
3949 if (separator == NULL) {
3950 Py_UNICODE blank = ' ';
3951 sep = &blank;
3952 seplen = 1;
3953 }
3954 else {
3955 separator = PyUnicode_FromObject(separator);
3956 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003957 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 sep = PyUnicode_AS_UNICODE(separator);
3959 seplen = PyUnicode_GET_SIZE(separator);
3960 }
Tim Petersced69f82003-09-16 20:30:58 +00003961
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962 res = _PyUnicode_New(sz);
3963 if (res == NULL)
3964 goto onError;
3965 p = PyUnicode_AS_UNICODE(res);
3966 reslen = 0;
3967
Tim Peters2cfe3682001-05-05 05:36:48 +00003968 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003970 PyObject *item = PyIter_Next(it);
3971 if (item == NULL) {
3972 if (PyErr_Occurred())
3973 goto onError;
3974 break;
3975 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976 if (!PyUnicode_Check(item)) {
3977 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003978 if (!PyString_Check(item)) {
3979 PyErr_Format(PyExc_TypeError,
3980 "sequence item %i: expected string or Unicode,"
3981 " %.80s found",
3982 i, item->ob_type->tp_name);
3983 Py_DECREF(item);
3984 goto onError;
3985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986 v = PyUnicode_FromObject(item);
3987 Py_DECREF(item);
3988 item = v;
3989 if (item == NULL)
3990 goto onError;
3991 }
3992 itemlen = PyUnicode_GET_SIZE(item);
3993 while (reslen + itemlen + seplen >= sz) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003994 if (_PyUnicode_Resize(&res, sz*2) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003995 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 sz *= 2;
3999 p = PyUnicode_AS_UNICODE(res) + reslen;
4000 }
4001 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004002 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 p += seplen;
4004 reslen += seplen;
4005 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004006 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 p += itemlen;
4008 reslen += itemlen;
4009 Py_DECREF(item);
4010 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004011 if (_PyUnicode_Resize(&res, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012 goto onError;
4013
4014 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004015 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 return (PyObject *)res;
4017
4018 onError:
4019 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004020 Py_XDECREF(res);
4021 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022 return NULL;
4023}
4024
Tim Petersced69f82003-09-16 20:30:58 +00004025static
4026PyUnicodeObject *pad(PyUnicodeObject *self,
4027 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028 int right,
4029 Py_UNICODE fill)
4030{
4031 PyUnicodeObject *u;
4032
4033 if (left < 0)
4034 left = 0;
4035 if (right < 0)
4036 right = 0;
4037
Tim Peters7a29bd52001-09-12 03:03:31 +00004038 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 Py_INCREF(self);
4040 return self;
4041 }
4042
4043 u = _PyUnicode_New(left + self->length + right);
4044 if (u) {
4045 if (left)
4046 Py_UNICODE_FILL(u->str, fill, left);
4047 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4048 if (right)
4049 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4050 }
4051
4052 return u;
4053}
4054
/* Append the slice data[left:right] to list as a new Unicode object.
   Expects the enclosing function to provide a "PyObject *str" scratch
   variable, a "list" variable and an "onError" label, which is jumped
   to when creating or appending the slice fails.
   Wrapped in do { } while (0) so that it behaves like one statement;
   use it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4065
4066static
4067PyObject *split_whitespace(PyUnicodeObject *self,
4068 PyObject *list,
4069 int maxcount)
4070{
4071 register int i;
4072 register int j;
4073 int len = self->length;
4074 PyObject *str;
4075
4076 for (i = j = 0; i < len; ) {
4077 /* find a token */
4078 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4079 i++;
4080 j = i;
4081 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4082 i++;
4083 if (j < i) {
4084 if (maxcount-- <= 0)
4085 break;
4086 SPLIT_APPEND(self->str, j, i);
4087 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4088 i++;
4089 j = i;
4090 }
4091 }
4092 if (j < len) {
4093 SPLIT_APPEND(self->str, j, len);
4094 }
4095 return list;
4096
4097 onError:
4098 Py_DECREF(list);
4099 return NULL;
4100}
4101
4102PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004103 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104{
4105 register int i;
4106 register int j;
4107 int len;
4108 PyObject *list;
4109 PyObject *str;
4110 Py_UNICODE *data;
4111
4112 string = PyUnicode_FromObject(string);
4113 if (string == NULL)
4114 return NULL;
4115 data = PyUnicode_AS_UNICODE(string);
4116 len = PyUnicode_GET_SIZE(string);
4117
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 list = PyList_New(0);
4119 if (!list)
4120 goto onError;
4121
4122 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004123 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004124
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 /* Find a line and append it */
4126 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4127 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128
4129 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004130 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 if (i < len) {
4132 if (data[i] == '\r' && i + 1 < len &&
4133 data[i+1] == '\n')
4134 i += 2;
4135 else
4136 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004137 if (keepends)
4138 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139 }
Guido van Rossum86662912000-04-11 15:38:46 +00004140 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141 j = i;
4142 }
4143 if (j < len) {
4144 SPLIT_APPEND(data, j, len);
4145 }
4146
4147 Py_DECREF(string);
4148 return list;
4149
4150 onError:
4151 Py_DECREF(list);
4152 Py_DECREF(string);
4153 return NULL;
4154}
4155
Tim Petersced69f82003-09-16 20:30:58 +00004156static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157PyObject *split_char(PyUnicodeObject *self,
4158 PyObject *list,
4159 Py_UNICODE ch,
4160 int maxcount)
4161{
4162 register int i;
4163 register int j;
4164 int len = self->length;
4165 PyObject *str;
4166
4167 for (i = j = 0; i < len; ) {
4168 if (self->str[i] == ch) {
4169 if (maxcount-- <= 0)
4170 break;
4171 SPLIT_APPEND(self->str, j, i);
4172 i = j = i + 1;
4173 } else
4174 i++;
4175 }
4176 if (j <= len) {
4177 SPLIT_APPEND(self->str, j, len);
4178 }
4179 return list;
4180
4181 onError:
4182 Py_DECREF(list);
4183 return NULL;
4184}
4185
Tim Petersced69f82003-09-16 20:30:58 +00004186static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187PyObject *split_substring(PyUnicodeObject *self,
4188 PyObject *list,
4189 PyUnicodeObject *substring,
4190 int maxcount)
4191{
4192 register int i;
4193 register int j;
4194 int len = self->length;
4195 int sublen = substring->length;
4196 PyObject *str;
4197
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004198 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 if (Py_UNICODE_MATCH(self, i, substring)) {
4200 if (maxcount-- <= 0)
4201 break;
4202 SPLIT_APPEND(self->str, j, i);
4203 i = j = i + sublen;
4204 } else
4205 i++;
4206 }
4207 if (j <= len) {
4208 SPLIT_APPEND(self->str, j, len);
4209 }
4210 return list;
4211
4212 onError:
4213 Py_DECREF(list);
4214 return NULL;
4215}
4216
4217#undef SPLIT_APPEND
4218
4219static
4220PyObject *split(PyUnicodeObject *self,
4221 PyUnicodeObject *substring,
4222 int maxcount)
4223{
4224 PyObject *list;
4225
4226 if (maxcount < 0)
4227 maxcount = INT_MAX;
4228
4229 list = PyList_New(0);
4230 if (!list)
4231 return NULL;
4232
4233 if (substring == NULL)
4234 return split_whitespace(self,list,maxcount);
4235
4236 else if (substring->length == 1)
4237 return split_char(self,list,substring->str[0],maxcount);
4238
4239 else if (substring->length == 0) {
4240 Py_DECREF(list);
4241 PyErr_SetString(PyExc_ValueError, "empty separator");
4242 return NULL;
4243 }
4244 else
4245 return split_substring(self,list,substring,maxcount);
4246}
4247
Tim Petersced69f82003-09-16 20:30:58 +00004248static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249PyObject *replace(PyUnicodeObject *self,
4250 PyUnicodeObject *str1,
4251 PyUnicodeObject *str2,
4252 int maxcount)
4253{
4254 PyUnicodeObject *u;
4255
4256 if (maxcount < 0)
4257 maxcount = INT_MAX;
4258
4259 if (str1->length == 1 && str2->length == 1) {
4260 int i;
4261
4262 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004263 if (!findchar(self->str, self->length, str1->str[0]) &&
4264 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265 /* nothing to replace, return original string */
4266 Py_INCREF(self);
4267 u = self;
4268 } else {
4269 Py_UNICODE u1 = str1->str[0];
4270 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004271
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004273 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274 self->length
4275 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004276 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004277 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004278 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 for (i = 0; i < u->length; i++)
4280 if (u->str[i] == u1) {
4281 if (--maxcount < 0)
4282 break;
4283 u->str[i] = u2;
4284 }
4285 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004286 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287
4288 } else {
4289 int n, i;
4290 Py_UNICODE *p;
4291
4292 /* replace strings */
4293 n = count(self, 0, self->length, str1);
4294 if (n > maxcount)
4295 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004296 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004298 if (PyUnicode_CheckExact(self)) {
4299 Py_INCREF(self);
4300 u = self;
4301 }
4302 else {
4303 u = (PyUnicodeObject *)
4304 PyUnicode_FromUnicode(self->str, self->length);
4305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 } else {
4307 u = _PyUnicode_New(
4308 self->length + n * (str2->length - str1->length));
4309 if (u) {
4310 i = 0;
4311 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004312 if (str1->length > 0) {
4313 while (i <= self->length - str1->length)
4314 if (Py_UNICODE_MATCH(self, i, str1)) {
4315 /* replace string segment */
4316 Py_UNICODE_COPY(p, str2->str, str2->length);
4317 p += str2->length;
4318 i += str1->length;
4319 if (--n <= 0) {
4320 /* copy remaining part */
4321 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4322 break;
4323 }
4324 } else
4325 *p++ = self->str[i++];
4326 } else {
4327 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328 Py_UNICODE_COPY(p, str2->str, str2->length);
4329 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004330 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004333 }
4334 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4335 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 }
4337 }
4338 }
Tim Petersced69f82003-09-16 20:30:58 +00004339
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340 return (PyObject *) u;
4341}
4342
4343/* --- Unicode Object Methods --------------------------------------------- */
4344
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004345PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346"S.title() -> unicode\n\
4347\n\
4348Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004349characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350
4351static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004352unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 return fixup(self, fixtitle);
4355}
4356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004357PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358"S.capitalize() -> unicode\n\
4359\n\
4360Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004361have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362
4363static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004364unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366 return fixup(self, fixcapitalize);
4367}
4368
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, replace
   each word in the list by its fixcapitalize'd version, then join the list
   again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004407PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408"S.center(width) -> unicode\n\
4409\n\
4410Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004411using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412
4413static PyObject *
4414unicode_center(PyUnicodeObject *self, PyObject *args)
4415{
4416 int marg, left;
4417 int width;
4418
4419 if (!PyArg_ParseTuple(args, "i:center", &width))
4420 return NULL;
4421
Tim Peters7a29bd52001-09-12 03:03:31 +00004422 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423 Py_INCREF(self);
4424 return (PyObject*) self;
4425 }
4426
4427 marg = width - self->length;
4428 left = marg / 2 + (marg & width & 1);
4429
4430 return (PyObject*) pad(self, left, marg - left, ' ');
4431}
4432
Marc-André Lemburge5034372000-08-08 08:04:29 +00004433#if 0
4434
4435/* This code should go into some future Unicode collation support
4436 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004437 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004438
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004439/* speedy UTF-16 code point order comparison */
4440/* gleaned from: */
4441/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4442
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004443static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004444{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004445 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004446 0, 0, 0, 0, 0, 0, 0, 0,
4447 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004448 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004449};
4450
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451static int
4452unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4453{
4454 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004455
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456 Py_UNICODE *s1 = str1->str;
4457 Py_UNICODE *s2 = str2->str;
4458
4459 len1 = str1->length;
4460 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004461
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004463 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004464
4465 c1 = *s1++;
4466 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004467
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004468 if (c1 > (1<<11) * 26)
4469 c1 += utf16Fixup[c1>>11];
4470 if (c2 > (1<<11) * 26)
4471 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004472 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004473
4474 if (c1 != c2)
4475 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004476
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004477 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 }
4479
4480 return (len1 < len2) ? -1 : (len1 != len2);
4481}
4482
Marc-André Lemburge5034372000-08-08 08:04:29 +00004483#else
4484
4485static int
4486unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4487{
4488 register int len1, len2;
4489
4490 Py_UNICODE *s1 = str1->str;
4491 Py_UNICODE *s2 = str2->str;
4492
4493 len1 = str1->length;
4494 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004495
Marc-André Lemburge5034372000-08-08 08:04:29 +00004496 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004497 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004498
Fredrik Lundh45714e92001-06-26 16:39:36 +00004499 c1 = *s1++;
4500 c2 = *s2++;
4501
4502 if (c1 != c2)
4503 return (c1 < c2) ? -1 : 1;
4504
Marc-André Lemburge5034372000-08-08 08:04:29 +00004505 len1--; len2--;
4506 }
4507
4508 return (len1 < len2) ? -1 : (len1 != len2);
4509}
4510
4511#endif
4512
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513int PyUnicode_Compare(PyObject *left,
4514 PyObject *right)
4515{
4516 PyUnicodeObject *u = NULL, *v = NULL;
4517 int result;
4518
4519 /* Coerce the two arguments */
4520 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4521 if (u == NULL)
4522 goto onError;
4523 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4524 if (v == NULL)
4525 goto onError;
4526
Thomas Wouters7e474022000-07-16 12:04:32 +00004527 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 if (v == u) {
4529 Py_DECREF(u);
4530 Py_DECREF(v);
4531 return 0;
4532 }
4533
4534 result = unicode_compare(u, v);
4535
4536 Py_DECREF(u);
4537 Py_DECREF(v);
4538 return result;
4539
4540onError:
4541 Py_XDECREF(u);
4542 Py_XDECREF(v);
4543 return -1;
4544}
4545
Guido van Rossum403d68b2000-03-13 15:55:09 +00004546int PyUnicode_Contains(PyObject *container,
4547 PyObject *element)
4548{
4549 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004550 int result, size;
4551 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004552
4553 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004554 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004555 if (v == NULL) {
4556 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004557 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004558 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004559 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004560 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004561 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004562 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004563
Barry Warsaw817918c2002-08-06 16:58:21 +00004564 size = PyUnicode_GET_SIZE(v);
4565 rhs = PyUnicode_AS_UNICODE(v);
4566 lhs = PyUnicode_AS_UNICODE(u);
4567
Guido van Rossum403d68b2000-03-13 15:55:09 +00004568 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004569 if (size == 1) {
4570 end = lhs + PyUnicode_GET_SIZE(u);
4571 while (lhs < end) {
4572 if (*lhs++ == *rhs) {
4573 result = 1;
4574 break;
4575 }
4576 }
4577 }
4578 else {
4579 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4580 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004581 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004582 result = 1;
4583 break;
4584 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004585 }
4586 }
4587
4588 Py_DECREF(u);
4589 Py_DECREF(v);
4590 return result;
4591
4592onError:
4593 Py_XDECREF(u);
4594 Py_XDECREF(v);
4595 return -1;
4596}
4597
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598/* Concat to string or Unicode object giving a new Unicode object. */
4599
4600PyObject *PyUnicode_Concat(PyObject *left,
4601 PyObject *right)
4602{
4603 PyUnicodeObject *u = NULL, *v = NULL, *w;
4604
4605 /* Coerce the two arguments */
4606 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4607 if (u == NULL)
4608 goto onError;
4609 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4610 if (v == NULL)
4611 goto onError;
4612
4613 /* Shortcuts */
4614 if (v == unicode_empty) {
4615 Py_DECREF(v);
4616 return (PyObject *)u;
4617 }
4618 if (u == unicode_empty) {
4619 Py_DECREF(u);
4620 return (PyObject *)v;
4621 }
4622
4623 /* Concat the two Unicode strings */
4624 w = _PyUnicode_New(u->length + v->length);
4625 if (w == NULL)
4626 goto onError;
4627 Py_UNICODE_COPY(w->str, u->str, u->length);
4628 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4629
4630 Py_DECREF(u);
4631 Py_DECREF(v);
4632 return (PyObject *)w;
4633
4634onError:
4635 Py_XDECREF(u);
4636 Py_XDECREF(v);
4637 return NULL;
4638}
4639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004640PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641"S.count(sub[, start[, end]]) -> int\n\
4642\n\
4643Return the number of occurrences of substring sub in Unicode string\n\
4644S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004645interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646
4647static PyObject *
4648unicode_count(PyUnicodeObject *self, PyObject *args)
4649{
4650 PyUnicodeObject *substring;
4651 int start = 0;
4652 int end = INT_MAX;
4653 PyObject *result;
4654
Guido van Rossumb8872e62000-05-09 14:14:27 +00004655 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4656 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657 return NULL;
4658
4659 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4660 (PyObject *)substring);
4661 if (substring == NULL)
4662 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004663
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664 if (start < 0)
4665 start += self->length;
4666 if (start < 0)
4667 start = 0;
4668 if (end > self->length)
4669 end = self->length;
4670 if (end < 0)
4671 end += self->length;
4672 if (end < 0)
4673 end = 0;
4674
4675 result = PyInt_FromLong((long) count(self, start, end, substring));
4676
4677 Py_DECREF(substring);
4678 return result;
4679}
4680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004681PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682"S.encode([encoding[,errors]]) -> string\n\
4683\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004684Return an encoded string version of S. Default encoding is the current\n\
4685default string encoding. errors may be given to set a different error\n\
4686handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004687a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4688'xmlcharrefreplace' as well as any other name registered with\n\
4689codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690
4691static PyObject *
4692unicode_encode(PyUnicodeObject *self, PyObject *args)
4693{
4694 char *encoding = NULL;
4695 char *errors = NULL;
4696 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4697 return NULL;
4698 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4699}
4700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004701PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702"S.expandtabs([tabsize]) -> unicode\n\
4703\n\
4704Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004705If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706
4707static PyObject*
4708unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4709{
4710 Py_UNICODE *e;
4711 Py_UNICODE *p;
4712 Py_UNICODE *q;
4713 int i, j;
4714 PyUnicodeObject *u;
4715 int tabsize = 8;
4716
4717 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4718 return NULL;
4719
Thomas Wouters7e474022000-07-16 12:04:32 +00004720 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 i = j = 0;
4722 e = self->str + self->length;
4723 for (p = self->str; p < e; p++)
4724 if (*p == '\t') {
4725 if (tabsize > 0)
4726 j += tabsize - (j % tabsize);
4727 }
4728 else {
4729 j++;
4730 if (*p == '\n' || *p == '\r') {
4731 i += j;
4732 j = 0;
4733 }
4734 }
4735
4736 /* Second pass: create output string and fill it */
4737 u = _PyUnicode_New(i + j);
4738 if (!u)
4739 return NULL;
4740
4741 j = 0;
4742 q = u->str;
4743
4744 for (p = self->str; p < e; p++)
4745 if (*p == '\t') {
4746 if (tabsize > 0) {
4747 i = tabsize - (j % tabsize);
4748 j += i;
4749 while (i--)
4750 *q++ = ' ';
4751 }
4752 }
4753 else {
4754 j++;
4755 *q++ = *p;
4756 if (*p == '\n' || *p == '\r')
4757 j = 0;
4758 }
4759
4760 return (PyObject*) u;
4761}
4762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004763PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764"S.find(sub [,start [,end]]) -> int\n\
4765\n\
4766Return the lowest index in S where substring sub is found,\n\
4767such that sub is contained within s[start,end]. Optional\n\
4768arguments start and end are interpreted as in slice notation.\n\
4769\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004770Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771
4772static PyObject *
4773unicode_find(PyUnicodeObject *self, PyObject *args)
4774{
4775 PyUnicodeObject *substring;
4776 int start = 0;
4777 int end = INT_MAX;
4778 PyObject *result;
4779
Guido van Rossumb8872e62000-05-09 14:14:27 +00004780 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4781 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782 return NULL;
4783 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4784 (PyObject *)substring);
4785 if (substring == NULL)
4786 return NULL;
4787
4788 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4789
4790 Py_DECREF(substring);
4791 return result;
4792}
4793
4794static PyObject *
4795unicode_getitem(PyUnicodeObject *self, int index)
4796{
4797 if (index < 0 || index >= self->length) {
4798 PyErr_SetString(PyExc_IndexError, "string index out of range");
4799 return NULL;
4800 }
4801
4802 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4803}
4804
4805static long
4806unicode_hash(PyUnicodeObject *self)
4807{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004808 /* Since Unicode objects compare equal to their ASCII string
4809 counterparts, they should use the individual character values
4810 as basis for their hash value. This is needed to assure that
4811 strings and Unicode objects behave in the same way as
4812 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813
Fredrik Lundhdde61642000-07-10 18:27:47 +00004814 register int len;
4815 register Py_UNICODE *p;
4816 register long x;
4817
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818 if (self->hash != -1)
4819 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004820 len = PyUnicode_GET_SIZE(self);
4821 p = PyUnicode_AS_UNICODE(self);
4822 x = *p << 7;
4823 while (--len >= 0)
4824 x = (1000003*x) ^ *p++;
4825 x ^= PyUnicode_GET_SIZE(self);
4826 if (x == -1)
4827 x = -2;
4828 self->hash = x;
4829 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830}
4831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004832PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833"S.index(sub [,start [,end]]) -> int\n\
4834\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004835Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836
4837static PyObject *
4838unicode_index(PyUnicodeObject *self, PyObject *args)
4839{
4840 int result;
4841 PyUnicodeObject *substring;
4842 int start = 0;
4843 int end = INT_MAX;
4844
Guido van Rossumb8872e62000-05-09 14:14:27 +00004845 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4846 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004848
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4850 (PyObject *)substring);
4851 if (substring == NULL)
4852 return NULL;
4853
4854 result = findstring(self, substring, start, end, 1);
4855
4856 Py_DECREF(substring);
4857 if (result < 0) {
4858 PyErr_SetString(PyExc_ValueError, "substring not found");
4859 return NULL;
4860 }
4861 return PyInt_FromLong(result);
4862}
4863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004864PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004865"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004867Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004868at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869
4870static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004871unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872{
4873 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4874 register const Py_UNICODE *e;
4875 int cased;
4876
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 /* Shortcut for single character strings */
4878 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004879 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004881 /* Special case for empty strings */
4882 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004883 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004884
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885 e = p + PyUnicode_GET_SIZE(self);
4886 cased = 0;
4887 for (; p < e; p++) {
4888 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004889
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004891 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 else if (!cased && Py_UNICODE_ISLOWER(ch))
4893 cased = 1;
4894 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004895 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896}
4897
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004898PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004899"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00004901Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004902at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903
4904static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004905unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906{
4907 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4908 register const Py_UNICODE *e;
4909 int cased;
4910
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911 /* Shortcut for single character strings */
4912 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004913 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004915 /* Special case for empty strings */
4916 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004917 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004918
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 e = p + PyUnicode_GET_SIZE(self);
4920 cased = 0;
4921 for (; p < e; p++) {
4922 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004923
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004925 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926 else if (!cased && Py_UNICODE_ISUPPER(ch))
4927 cased = 1;
4928 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004929 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930}
4931
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004932PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004933"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00004935Return True if S is a titlecased string and there is at least one\n\
4936character in S, i.e. upper- and titlecase characters may only\n\
4937follow uncased characters and lowercase characters only cased ones.\n\
4938Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939
4940static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004941unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942{
4943 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4944 register const Py_UNICODE *e;
4945 int cased, previous_is_cased;
4946
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947 /* Shortcut for single character strings */
4948 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004949 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4950 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004952 /* Special case for empty strings */
4953 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004954 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004955
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 e = p + PyUnicode_GET_SIZE(self);
4957 cased = 0;
4958 previous_is_cased = 0;
4959 for (; p < e; p++) {
4960 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004961
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4963 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004964 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 previous_is_cased = 1;
4966 cased = 1;
4967 }
4968 else if (Py_UNICODE_ISLOWER(ch)) {
4969 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004970 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971 previous_is_cased = 1;
4972 cased = 1;
4973 }
4974 else
4975 previous_is_cased = 0;
4976 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004977 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978}
4979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004980PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004981"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00004983Return True if all characters in S are whitespace\n\
4984and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985
4986static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004987unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988{
4989 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4990 register const Py_UNICODE *e;
4991
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992 /* Shortcut for single character strings */
4993 if (PyUnicode_GET_SIZE(self) == 1 &&
4994 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004995 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004997 /* Special case for empty strings */
4998 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004999 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005000
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001 e = p + PyUnicode_GET_SIZE(self);
5002 for (; p < e; p++) {
5003 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005004 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005006 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007}
5008
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005009PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005010"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005011\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005012Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005013and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005014
5015static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005016unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005017{
5018 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5019 register const Py_UNICODE *e;
5020
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005021 /* Shortcut for single character strings */
5022 if (PyUnicode_GET_SIZE(self) == 1 &&
5023 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005024 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005025
5026 /* Special case for empty strings */
5027 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005028 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005029
5030 e = p + PyUnicode_GET_SIZE(self);
5031 for (; p < e; p++) {
5032 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005033 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005034 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005035 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005036}
5037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005038PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005039"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005040\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005041Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005042and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005043
5044static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005045unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005046{
5047 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5048 register const Py_UNICODE *e;
5049
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005050 /* Shortcut for single character strings */
5051 if (PyUnicode_GET_SIZE(self) == 1 &&
5052 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005053 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005054
5055 /* Special case for empty strings */
5056 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005057 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005058
5059 e = p + PyUnicode_GET_SIZE(self);
5060 for (; p < e; p++) {
5061 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005062 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005063 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005064 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005065}
5066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005067PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005068"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005070Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005071False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072
5073static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005074unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075{
5076 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5077 register const Py_UNICODE *e;
5078
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079 /* Shortcut for single character strings */
5080 if (PyUnicode_GET_SIZE(self) == 1 &&
5081 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005082 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005084 /* Special case for empty strings */
5085 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005086 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005087
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088 e = p + PyUnicode_GET_SIZE(self);
5089 for (; p < e; p++) {
5090 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005091 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005093 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094}
5095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005096PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005097"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005099Return True if all characters in S are digits\n\
5100and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101
5102static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005103unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104{
5105 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5106 register const Py_UNICODE *e;
5107
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108 /* Shortcut for single character strings */
5109 if (PyUnicode_GET_SIZE(self) == 1 &&
5110 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005111 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005113 /* Special case for empty strings */
5114 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005115 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005116
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117 e = p + PyUnicode_GET_SIZE(self);
5118 for (; p < e; p++) {
5119 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005120 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005122 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123}
5124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005125PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005126"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005128Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005129False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130
5131static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005132unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133{
5134 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5135 register const Py_UNICODE *e;
5136
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 /* Shortcut for single character strings */
5138 if (PyUnicode_GET_SIZE(self) == 1 &&
5139 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005140 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005142 /* Special case for empty strings */
5143 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005144 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005145
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 e = p + PyUnicode_GET_SIZE(self);
5147 for (; p < e; p++) {
5148 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005149 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005151 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152}
5153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005154PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155"S.join(sequence) -> unicode\n\
5156\n\
5157Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005158sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159
5160static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005161unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005163 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164}
5165
5166static int
5167unicode_length(PyUnicodeObject *self)
5168{
5169 return self->length;
5170}
5171
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005172PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173"S.ljust(width) -> unicode\n\
5174\n\
5175Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005176done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177
5178static PyObject *
5179unicode_ljust(PyUnicodeObject *self, PyObject *args)
5180{
5181 int width;
5182 if (!PyArg_ParseTuple(args, "i:ljust", &width))
5183 return NULL;
5184
Tim Peters7a29bd52001-09-12 03:03:31 +00005185 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 Py_INCREF(self);
5187 return (PyObject*) self;
5188 }
5189
5190 return (PyObject*) pad(self, 0, width - self->length, ' ');
5191}
5192
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005193PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194"S.lower() -> unicode\n\
5195\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005196Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197
5198static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005199unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 return fixup(self, fixlower);
5202}
5203
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Argument-parsing format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
5212
5213static const Py_UNICODE *
5214unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5215{
Tim Peters030a5ce2002-04-22 19:00:10 +00005216 size_t i;
5217 for (i = 0; i < n; ++i)
5218 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005219 return s+i;
5220 return NULL;
5221}
5222
5223/* externally visible for str.strip(unicode) */
5224PyObject *
5225_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5226{
5227 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5228 int len = PyUnicode_GET_SIZE(self);
5229 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5230 int seplen = PyUnicode_GET_SIZE(sepobj);
5231 int i, j;
5232
5233 i = 0;
5234 if (striptype != RIGHTSTRIP) {
5235 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5236 i++;
5237 }
5238 }
5239
5240 j = len;
5241 if (striptype != LEFTSTRIP) {
5242 do {
5243 j--;
5244 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5245 j++;
5246 }
5247
5248 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5249 Py_INCREF(self);
5250 return (PyObject*)self;
5251 }
5252 else
5253 return PyUnicode_FromUnicode(s+i, j-i);
5254}
5255
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256
5257static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005258do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005260 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5261 int len = PyUnicode_GET_SIZE(self), i, j;
5262
5263 i = 0;
5264 if (striptype != RIGHTSTRIP) {
5265 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5266 i++;
5267 }
5268 }
5269
5270 j = len;
5271 if (striptype != LEFTSTRIP) {
5272 do {
5273 j--;
5274 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5275 j++;
5276 }
5277
5278 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5279 Py_INCREF(self);
5280 return (PyObject*)self;
5281 }
5282 else
5283 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284}
5285
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005286
5287static PyObject *
5288do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5289{
5290 PyObject *sep = NULL;
5291
5292 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5293 return NULL;
5294
5295 if (sep != NULL && sep != Py_None) {
5296 if (PyUnicode_Check(sep))
5297 return _PyUnicode_XStrip(self, striptype, sep);
5298 else if (PyString_Check(sep)) {
5299 PyObject *res;
5300 sep = PyUnicode_FromObject(sep);
5301 if (sep==NULL)
5302 return NULL;
5303 res = _PyUnicode_XStrip(self, striptype, sep);
5304 Py_DECREF(sep);
5305 return res;
5306 }
5307 else {
5308 PyErr_Format(PyExc_TypeError,
5309 "%s arg must be None, unicode or str",
5310 STRIPNAME(striptype));
5311 return NULL;
5312 }
5313 }
5314
5315 return do_strip(self, striptype);
5316}
5317
5318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005319PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005320"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005321\n\
5322Return a copy of the string S with leading and trailing\n\
5323whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005324If chars is given and not None, remove characters in chars instead.\n\
5325If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005326
5327static PyObject *
5328unicode_strip(PyUnicodeObject *self, PyObject *args)
5329{
5330 if (PyTuple_GET_SIZE(args) == 0)
5331 return do_strip(self, BOTHSTRIP); /* Common case */
5332 else
5333 return do_argstrip(self, BOTHSTRIP, args);
5334}
5335
5336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005337PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005338"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005339\n\
5340Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005341If chars is given and not None, remove characters in chars instead.\n\
5342If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005343
5344static PyObject *
5345unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5346{
5347 if (PyTuple_GET_SIZE(args) == 0)
5348 return do_strip(self, LEFTSTRIP); /* Common case */
5349 else
5350 return do_argstrip(self, LEFTSTRIP, args);
5351}
5352
5353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005354PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005355"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005356\n\
5357Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005358If chars is given and not None, remove characters in chars instead.\n\
5359If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005360
5361static PyObject *
5362unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5363{
5364 if (PyTuple_GET_SIZE(args) == 0)
5365 return do_strip(self, RIGHTSTRIP); /* Common case */
5366 else
5367 return do_argstrip(self, RIGHTSTRIP, args);
5368}
5369
5370
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371static PyObject*
5372unicode_repeat(PyUnicodeObject *str, int len)
5373{
5374 PyUnicodeObject *u;
5375 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005376 int nchars;
5377 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378
5379 if (len < 0)
5380 len = 0;
5381
Tim Peters7a29bd52001-09-12 03:03:31 +00005382 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 /* no repeat, return original string */
5384 Py_INCREF(str);
5385 return (PyObject*) str;
5386 }
Tim Peters8f422462000-09-09 06:13:41 +00005387
5388 /* ensure # of chars needed doesn't overflow int and # of bytes
5389 * needed doesn't overflow size_t
5390 */
5391 nchars = len * str->length;
5392 if (len && nchars / len != str->length) {
5393 PyErr_SetString(PyExc_OverflowError,
5394 "repeated string is too long");
5395 return NULL;
5396 }
5397 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5398 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5399 PyErr_SetString(PyExc_OverflowError,
5400 "repeated string is too long");
5401 return NULL;
5402 }
5403 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 if (!u)
5405 return NULL;
5406
5407 p = u->str;
5408
5409 while (len-- > 0) {
5410 Py_UNICODE_COPY(p, str->str, str->length);
5411 p += str->length;
5412 }
5413
5414 return (PyObject*) u;
5415}
5416
5417PyObject *PyUnicode_Replace(PyObject *obj,
5418 PyObject *subobj,
5419 PyObject *replobj,
5420 int maxcount)
5421{
5422 PyObject *self;
5423 PyObject *str1;
5424 PyObject *str2;
5425 PyObject *result;
5426
5427 self = PyUnicode_FromObject(obj);
5428 if (self == NULL)
5429 return NULL;
5430 str1 = PyUnicode_FromObject(subobj);
5431 if (str1 == NULL) {
5432 Py_DECREF(self);
5433 return NULL;
5434 }
5435 str2 = PyUnicode_FromObject(replobj);
5436 if (str2 == NULL) {
5437 Py_DECREF(self);
5438 Py_DECREF(str1);
5439 return NULL;
5440 }
Tim Petersced69f82003-09-16 20:30:58 +00005441 result = replace((PyUnicodeObject *)self,
5442 (PyUnicodeObject *)str1,
5443 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 maxcount);
5445 Py_DECREF(self);
5446 Py_DECREF(str1);
5447 Py_DECREF(str2);
5448 return result;
5449}
5450
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005451PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452"S.replace (old, new[, maxsplit]) -> unicode\n\
5453\n\
5454Return a copy of S with all occurrences of substring\n\
5455old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005456given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457
5458static PyObject*
5459unicode_replace(PyUnicodeObject *self, PyObject *args)
5460{
5461 PyUnicodeObject *str1;
5462 PyUnicodeObject *str2;
5463 int maxcount = -1;
5464 PyObject *result;
5465
5466 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5467 return NULL;
5468 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5469 if (str1 == NULL)
5470 return NULL;
5471 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005472 if (str2 == NULL) {
5473 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476
5477 result = replace(self, str1, str2, maxcount);
5478
5479 Py_DECREF(str1);
5480 Py_DECREF(str2);
5481 return result;
5482}
5483
5484static
5485PyObject *unicode_repr(PyObject *unicode)
5486{
5487 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5488 PyUnicode_GET_SIZE(unicode),
5489 1);
5490}
5491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005492PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493"S.rfind(sub [,start [,end]]) -> int\n\
5494\n\
5495Return the highest index in S where substring sub is found,\n\
5496such that sub is contained within s[start,end]. Optional\n\
5497arguments start and end are interpreted as in slice notation.\n\
5498\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005499Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500
5501static PyObject *
5502unicode_rfind(PyUnicodeObject *self, PyObject *args)
5503{
5504 PyUnicodeObject *substring;
5505 int start = 0;
5506 int end = INT_MAX;
5507 PyObject *result;
5508
Guido van Rossumb8872e62000-05-09 14:14:27 +00005509 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5510 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 return NULL;
5512 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5513 (PyObject *)substring);
5514 if (substring == NULL)
5515 return NULL;
5516
5517 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5518
5519 Py_DECREF(substring);
5520 return result;
5521}
5522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005523PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524"S.rindex(sub [,start [,end]]) -> int\n\
5525\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005526Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527
5528static PyObject *
5529unicode_rindex(PyUnicodeObject *self, PyObject *args)
5530{
5531 int result;
5532 PyUnicodeObject *substring;
5533 int start = 0;
5534 int end = INT_MAX;
5535
Guido van Rossumb8872e62000-05-09 14:14:27 +00005536 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5537 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 return NULL;
5539 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5540 (PyObject *)substring);
5541 if (substring == NULL)
5542 return NULL;
5543
5544 result = findstring(self, substring, start, end, -1);
5545
5546 Py_DECREF(substring);
5547 if (result < 0) {
5548 PyErr_SetString(PyExc_ValueError, "substring not found");
5549 return NULL;
5550 }
5551 return PyInt_FromLong(result);
5552}
5553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005554PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555"S.rjust(width) -> unicode\n\
5556\n\
5557Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005558done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559
5560static PyObject *
5561unicode_rjust(PyUnicodeObject *self, PyObject *args)
5562{
5563 int width;
5564 if (!PyArg_ParseTuple(args, "i:rjust", &width))
5565 return NULL;
5566
Tim Peters7a29bd52001-09-12 03:03:31 +00005567 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 Py_INCREF(self);
5569 return (PyObject*) self;
5570 }
5571
5572 return (PyObject*) pad(self, width - self->length, 0, ' ');
5573}
5574
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575static PyObject*
5576unicode_slice(PyUnicodeObject *self, int start, int end)
5577{
5578 /* standard clamping */
5579 if (start < 0)
5580 start = 0;
5581 if (end < 0)
5582 end = 0;
5583 if (end > self->length)
5584 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005585 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 /* full slice, return original string */
5587 Py_INCREF(self);
5588 return (PyObject*) self;
5589 }
5590 if (start > end)
5591 start = end;
5592 /* copy slice */
5593 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5594 end - start);
5595}
5596
5597PyObject *PyUnicode_Split(PyObject *s,
5598 PyObject *sep,
5599 int maxsplit)
5600{
5601 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005602
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 s = PyUnicode_FromObject(s);
5604 if (s == NULL)
5605 return NULL;
5606 if (sep != NULL) {
5607 sep = PyUnicode_FromObject(sep);
5608 if (sep == NULL) {
5609 Py_DECREF(s);
5610 return NULL;
5611 }
5612 }
5613
5614 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5615
5616 Py_DECREF(s);
5617 Py_XDECREF(sep);
5618 return result;
5619}
5620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005621PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622"S.split([sep [,maxsplit]]) -> list of strings\n\
5623\n\
5624Return a list of the words in S, using sep as the\n\
5625delimiter string. If maxsplit is given, at most maxsplit\n\
5626splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005627is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628
5629static PyObject*
5630unicode_split(PyUnicodeObject *self, PyObject *args)
5631{
5632 PyObject *substring = Py_None;
5633 int maxcount = -1;
5634
5635 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5636 return NULL;
5637
5638 if (substring == Py_None)
5639 return split(self, NULL, maxcount);
5640 else if (PyUnicode_Check(substring))
5641 return split(self, (PyUnicodeObject *)substring, maxcount);
5642 else
5643 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5644}
5645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005646PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005647"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648\n\
5649Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005650Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005651is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652
5653static PyObject*
5654unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5655{
Guido van Rossum86662912000-04-11 15:38:46 +00005656 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657
Guido van Rossum86662912000-04-11 15:38:46 +00005658 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 return NULL;
5660
Guido van Rossum86662912000-04-11 15:38:46 +00005661 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662}
5663
5664static
5665PyObject *unicode_str(PyUnicodeObject *self)
5666{
Fred Drakee4315f52000-05-09 19:53:39 +00005667 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668}
5669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005670PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671"S.swapcase() -> unicode\n\
5672\n\
5673Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005674and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675
5676static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005677unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 return fixup(self, fixswapcase);
5680}
5681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005682PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683"S.translate(table) -> unicode\n\
5684\n\
5685Return a copy of the string S, where all characters have been mapped\n\
5686through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005687Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5688Unmapped characters are left untouched. Characters mapped to None\n\
5689are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690
5691static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005692unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693{
Tim Petersced69f82003-09-16 20:30:58 +00005694 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00005696 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 "ignore");
5698}
5699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005700PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701"S.upper() -> unicode\n\
5702\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005703Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704
5705static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005706unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 return fixup(self, fixupper);
5709}
5710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005711PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712"S.zfill(width) -> unicode\n\
5713\n\
5714Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005715of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716
5717static PyObject *
5718unicode_zfill(PyUnicodeObject *self, PyObject *args)
5719{
5720 int fill;
5721 PyUnicodeObject *u;
5722
5723 int width;
5724 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5725 return NULL;
5726
5727 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005728 if (PyUnicode_CheckExact(self)) {
5729 Py_INCREF(self);
5730 return (PyObject*) self;
5731 }
5732 else
5733 return PyUnicode_FromUnicode(
5734 PyUnicode_AS_UNICODE(self),
5735 PyUnicode_GET_SIZE(self)
5736 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 }
5738
5739 fill = width - self->length;
5740
5741 u = pad(self, fill, 0, '0');
5742
Walter Dörwald068325e2002-04-15 13:36:47 +00005743 if (u == NULL)
5744 return NULL;
5745
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 if (u->str[fill] == '+' || u->str[fill] == '-') {
5747 /* move sign to beginning of string */
5748 u->str[0] = u->str[fill];
5749 u->str[fill] = '0';
5750 }
5751
5752 return (PyObject*) u;
5753}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
#if 0
/* Compiled out by the surrounding #if 0: returns the value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
5762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005763PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005764"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005766Return True if S starts with the specified prefix, False otherwise.\n\
5767With optional start, test S beginning at that position.\n\
5768With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769
5770static PyObject *
5771unicode_startswith(PyUnicodeObject *self,
5772 PyObject *args)
5773{
5774 PyUnicodeObject *substring;
5775 int start = 0;
5776 int end = INT_MAX;
5777 PyObject *result;
5778
Guido van Rossumb8872e62000-05-09 14:14:27 +00005779 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5780 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781 return NULL;
5782 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5783 (PyObject *)substring);
5784 if (substring == NULL)
5785 return NULL;
5786
Guido van Rossum77f6a652002-04-03 22:41:51 +00005787 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788
5789 Py_DECREF(substring);
5790 return result;
5791}
5792
5793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005794PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005795"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005797Return True if S ends with the specified suffix, False otherwise.\n\
5798With optional start, test S beginning at that position.\n\
5799With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
5801static PyObject *
5802unicode_endswith(PyUnicodeObject *self,
5803 PyObject *args)
5804{
5805 PyUnicodeObject *substring;
5806 int start = 0;
5807 int end = INT_MAX;
5808 PyObject *result;
5809
Guido van Rossumb8872e62000-05-09 14:14:27 +00005810 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5811 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812 return NULL;
5813 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5814 (PyObject *)substring);
5815 if (substring == NULL)
5816 return NULL;
5817
Guido van Rossum77f6a652002-04-03 22:41:51 +00005818 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819
5820 Py_DECREF(substring);
5821 return result;
5822}
5823
5824
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005825
5826static PyObject *
5827unicode_getnewargs(PyUnicodeObject *v)
5828{
5829 return Py_BuildValue("(u#)", v->str, v->length);
5830}
5831
5832
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833static PyMethodDef unicode_methods[] = {
5834
5835 /* Order is according to common usage: often used methods should
5836 appear first, since lookup is done sequentially. */
5837
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005838 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5839 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5840 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5841 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5842 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5843 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5844 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5845 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5846 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5847 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5848 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5849 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5850 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005851 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005852/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5853 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5854 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5855 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005856 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005857 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005858 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005859 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5860 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5861 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5862 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5863 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5864 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5865 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5866 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5867 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5868 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5869 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5870 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5871 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5872 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005873 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005874#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005875 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876#endif
5877
5878#if 0
5879 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005880 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881#endif
5882
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005883 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 {NULL, NULL}
5885};
5886
Neil Schemenauerce30bc92002-11-18 16:10:18 +00005887static PyObject *
5888unicode_mod(PyObject *v, PyObject *w)
5889{
5890 if (!PyUnicode_Check(v)) {
5891 Py_INCREF(Py_NotImplemented);
5892 return Py_NotImplemented;
5893 }
5894 return PyUnicode_Format(v, w);
5895}
5896
5897static PyNumberMethods unicode_as_number = {
5898 0, /*nb_add*/
5899 0, /*nb_subtract*/
5900 0, /*nb_multiply*/
5901 0, /*nb_divide*/
5902 unicode_mod, /*nb_remainder*/
5903};
5904
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905static PySequenceMethods unicode_as_sequence = {
5906 (inquiry) unicode_length, /* sq_length */
5907 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5908 (intargfunc) unicode_repeat, /* sq_repeat */
5909 (intargfunc) unicode_getitem, /* sq_item */
5910 (intintargfunc) unicode_slice, /* sq_slice */
5911 0, /* sq_ass_item */
5912 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005913 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914};
5915
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005916static PyObject*
5917unicode_subscript(PyUnicodeObject* self, PyObject* item)
5918{
5919 if (PyInt_Check(item)) {
5920 long i = PyInt_AS_LONG(item);
5921 if (i < 0)
5922 i += PyString_GET_SIZE(self);
5923 return unicode_getitem(self, i);
5924 } else if (PyLong_Check(item)) {
5925 long i = PyLong_AsLong(item);
5926 if (i == -1 && PyErr_Occurred())
5927 return NULL;
5928 if (i < 0)
5929 i += PyString_GET_SIZE(self);
5930 return unicode_getitem(self, i);
5931 } else if (PySlice_Check(item)) {
5932 int start, stop, step, slicelength, cur, i;
5933 Py_UNICODE* source_buf;
5934 Py_UNICODE* result_buf;
5935 PyObject* result;
5936
5937 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5938 &start, &stop, &step, &slicelength) < 0) {
5939 return NULL;
5940 }
5941
5942 if (slicelength <= 0) {
5943 return PyUnicode_FromUnicode(NULL, 0);
5944 } else {
5945 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5946 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5947
5948 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5949 result_buf[i] = source_buf[cur];
5950 }
Tim Petersced69f82003-09-16 20:30:58 +00005951
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005952 result = PyUnicode_FromUnicode(result_buf, slicelength);
5953 PyMem_FREE(result_buf);
5954 return result;
5955 }
5956 } else {
5957 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5958 return NULL;
5959 }
5960}
5961
5962static PyMappingMethods unicode_as_mapping = {
5963 (inquiry)unicode_length, /* mp_length */
5964 (binaryfunc)unicode_subscript, /* mp_subscript */
5965 (objobjargproc)0, /* mp_ass_subscript */
5966};
5967
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968static int
5969unicode_buffer_getreadbuf(PyUnicodeObject *self,
5970 int index,
5971 const void **ptr)
5972{
5973 if (index != 0) {
5974 PyErr_SetString(PyExc_SystemError,
5975 "accessing non-existent unicode segment");
5976 return -1;
5977 }
5978 *ptr = (void *) self->str;
5979 return PyUnicode_GET_DATA_SIZE(self);
5980}
5981
5982static int
5983unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5984 const void **ptr)
5985{
5986 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005987 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 return -1;
5989}
5990
5991static int
5992unicode_buffer_getsegcount(PyUnicodeObject *self,
5993 int *lenp)
5994{
5995 if (lenp)
5996 *lenp = PyUnicode_GET_DATA_SIZE(self);
5997 return 1;
5998}
5999
6000static int
6001unicode_buffer_getcharbuf(PyUnicodeObject *self,
6002 int index,
6003 const void **ptr)
6004{
6005 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006006
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 if (index != 0) {
6008 PyErr_SetString(PyExc_SystemError,
6009 "accessing non-existent unicode segment");
6010 return -1;
6011 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006012 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 if (str == NULL)
6014 return -1;
6015 *ptr = (void *) PyString_AS_STRING(str);
6016 return PyString_GET_SIZE(str);
6017}
6018
6019/* Helpers for PyUnicode_Format() */
6020
6021static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006022getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023{
6024 int argidx = *p_argidx;
6025 if (argidx < arglen) {
6026 (*p_argidx)++;
6027 if (arglen < 0)
6028 return args;
6029 else
6030 return PyTuple_GetItem(args, argidx);
6031 }
6032 PyErr_SetString(PyExc_TypeError,
6033 "not enough arguments for format string");
6034 return NULL;
6035}
6036
/* Bit flags collected from the flag characters of a format specifier
   ('-', '+', ' ', '#', '0' respectively). */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6042
6043static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045{
6046 register int i;
6047 int len;
6048 va_list va;
6049 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051
6052 /* First, format the string as char array, then expand to Py_UNICODE
6053 array. */
6054 charbuffer = (char *)buffer;
6055 len = vsprintf(charbuffer, format, va);
6056 for (i = len - 1; i >= 0; i--)
6057 buffer[i] = (Py_UNICODE) charbuffer[i];
6058
6059 va_end(va);
6060 return len;
6061}
6062
Guido van Rossum078151d2002-08-11 04:24:12 +00006063/* XXX To save some code duplication, formatfloat/long/int could have been
6064 shared with stringobject.c, converting from 8-bit to Unicode after the
6065 formatting is done. */
6066
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067static int
6068formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006069 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 int flags,
6071 int prec,
6072 int type,
6073 PyObject *v)
6074{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006075 /* fmt = '%#.' + `prec` + `type`
6076 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 char fmt[20];
6078 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006079
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 x = PyFloat_AsDouble(v);
6081 if (x == -1.0 && PyErr_Occurred())
6082 return -1;
6083 if (prec < 0)
6084 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6086 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006087 /* Worst case length calc to ensure no buffer overrun:
6088
6089 'g' formats:
6090 fmt = %#.<prec>g
6091 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6092 for any double rep.)
6093 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6094
6095 'f' formats:
6096 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6097 len = 1 + 50 + 1 + prec = 52 + prec
6098
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006099 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006100 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006101
6102 */
6103 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6104 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006105 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006106 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006107 return -1;
6108 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006109 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6110 (flags&F_ALT) ? "#" : "",
6111 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 return usprintf(buf, fmt, x);
6113}
6114
Tim Peters38fd5b62000-09-21 05:43:11 +00006115static PyObject*
6116formatlong(PyObject *val, int flags, int prec, int type)
6117{
6118 char *buf;
6119 int i, len;
6120 PyObject *str; /* temporary string object. */
6121 PyUnicodeObject *result;
6122
6123 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6124 if (!str)
6125 return NULL;
6126 result = _PyUnicode_New(len);
6127 for (i = 0; i < len; i++)
6128 result->str[i] = buf[i];
6129 result->str[len] = 0;
6130 Py_DECREF(str);
6131 return (PyObject*)result;
6132}
6133
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134static int
6135formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006136 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 int flags,
6138 int prec,
6139 int type,
6140 PyObject *v)
6141{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006142 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006143 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6144 * + 1 + 1
6145 * = 24
6146 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006147 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 long x;
6149
6150 x = PyInt_AsLong(v);
6151 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006152 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00006153 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00006154 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00006155 "%u/%o/%x/%X of negative int will return "
6156 "a signed string in Python 2.4 and up") < 0)
6157 return -1;
6158 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006160 prec = 1;
6161
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006162 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006163 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6164 */
6165 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006166 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006167 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006168 return -1;
6169 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006170
6171 if ((flags & F_ALT) &&
6172 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006173 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006174 * of issues that cause pain:
6175 * - when 0 is being converted, the C standard leaves off
6176 * the '0x' or '0X', which is inconsistent with other
6177 * %#x/%#X conversions and inconsistent with Python's
6178 * hex() function
6179 * - there are platforms that violate the standard and
6180 * convert 0 with the '0x' or '0X'
6181 * (Metrowerks, Compaq Tru64)
6182 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006183 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006184 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006185 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006186 * We can achieve the desired consistency by inserting our
6187 * own '0x' or '0X' prefix, and substituting %x/%X in place
6188 * of %#x/%#X.
6189 *
6190 * Note that this is the same approach as used in
6191 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006192 */
Tim Petersced69f82003-09-16 20:30:58 +00006193 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006194 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006195 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006196 else {
6197 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
Tim Petersced69f82003-09-16 20:30:58 +00006198 (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006199 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 return usprintf(buf, fmt, x);
6202}
6203
6204static int
6205formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006206 size_t buflen,
6207 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006209 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006210 if (PyUnicode_Check(v)) {
6211 if (PyUnicode_GET_SIZE(v) != 1)
6212 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006216 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006217 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006218 goto onError;
6219 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221
6222 else {
6223 /* Integer input truncated to a character */
6224 long x;
6225 x = PyInt_AsLong(v);
6226 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006227 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006228#ifdef Py_UNICODE_WIDE
6229 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006230 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006231 "%c arg not in range(0x110000) "
6232 "(wide Python build)");
6233 return -1;
6234 }
6235#else
6236 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006237 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006238 "%c arg not in range(0x10000) "
6239 "(narrow Python build)");
6240 return -1;
6241 }
6242#endif
6243 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 }
6245 buf[1] = '\0';
6246 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006247
6248 onError:
6249 PyErr_SetString(PyExc_TypeError,
6250 "%c requires int or char");
6251 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252}
6253
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006254/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6255
6256 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6257 chars are formatted. XXX This is a magic number. Each formatting
6258 routine does bounds checking to ensure no overflow, but a better
6259 solution may be to malloc a buffer of appropriate size for each
6260 format. For now, the current solution is sufficient.
6261*/
6262#define FORMATBUFLEN (size_t)120
6263
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264PyObject *PyUnicode_Format(PyObject *format,
6265 PyObject *args)
6266{
6267 Py_UNICODE *fmt, *res;
6268 int fmtcnt, rescnt, reslen, arglen, argidx;
6269 int args_owned = 0;
6270 PyUnicodeObject *result = NULL;
6271 PyObject *dict = NULL;
6272 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006273
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 if (format == NULL || args == NULL) {
6275 PyErr_BadInternalCall();
6276 return NULL;
6277 }
6278 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006279 if (uformat == NULL)
6280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 fmt = PyUnicode_AS_UNICODE(uformat);
6282 fmtcnt = PyUnicode_GET_SIZE(uformat);
6283
6284 reslen = rescnt = fmtcnt + 100;
6285 result = _PyUnicode_New(reslen);
6286 if (result == NULL)
6287 goto onError;
6288 res = PyUnicode_AS_UNICODE(result);
6289
6290 if (PyTuple_Check(args)) {
6291 arglen = PyTuple_Size(args);
6292 argidx = 0;
6293 }
6294 else {
6295 arglen = -1;
6296 argidx = -2;
6297 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006298 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6299 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 dict = args;
6301
6302 while (--fmtcnt >= 0) {
6303 if (*fmt != '%') {
6304 if (--rescnt < 0) {
6305 rescnt = fmtcnt + 100;
6306 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006307 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308 return NULL;
6309 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6310 --rescnt;
6311 }
6312 *res++ = *fmt++;
6313 }
6314 else {
6315 /* Got a format specifier */
6316 int flags = 0;
6317 int width = -1;
6318 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 Py_UNICODE c = '\0';
6320 Py_UNICODE fill;
6321 PyObject *v = NULL;
6322 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006323 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 Py_UNICODE sign;
6325 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006326 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327
6328 fmt++;
6329 if (*fmt == '(') {
6330 Py_UNICODE *keystart;
6331 int keylen;
6332 PyObject *key;
6333 int pcount = 1;
6334
6335 if (dict == NULL) {
6336 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006337 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338 goto onError;
6339 }
6340 ++fmt;
6341 --fmtcnt;
6342 keystart = fmt;
6343 /* Skip over balanced parentheses */
6344 while (pcount > 0 && --fmtcnt >= 0) {
6345 if (*fmt == ')')
6346 --pcount;
6347 else if (*fmt == '(')
6348 ++pcount;
6349 fmt++;
6350 }
6351 keylen = fmt - keystart - 1;
6352 if (fmtcnt < 0 || pcount > 0) {
6353 PyErr_SetString(PyExc_ValueError,
6354 "incomplete format key");
6355 goto onError;
6356 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006357#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006358 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 then looked up since Python uses strings to hold
6360 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006361 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 key = PyUnicode_EncodeUTF8(keystart,
6363 keylen,
6364 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006365#else
6366 key = PyUnicode_FromUnicode(keystart, keylen);
6367#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 if (key == NULL)
6369 goto onError;
6370 if (args_owned) {
6371 Py_DECREF(args);
6372 args_owned = 0;
6373 }
6374 args = PyObject_GetItem(dict, key);
6375 Py_DECREF(key);
6376 if (args == NULL) {
6377 goto onError;
6378 }
6379 args_owned = 1;
6380 arglen = -1;
6381 argidx = -2;
6382 }
6383 while (--fmtcnt >= 0) {
6384 switch (c = *fmt++) {
6385 case '-': flags |= F_LJUST; continue;
6386 case '+': flags |= F_SIGN; continue;
6387 case ' ': flags |= F_BLANK; continue;
6388 case '#': flags |= F_ALT; continue;
6389 case '0': flags |= F_ZERO; continue;
6390 }
6391 break;
6392 }
6393 if (c == '*') {
6394 v = getnextarg(args, arglen, &argidx);
6395 if (v == NULL)
6396 goto onError;
6397 if (!PyInt_Check(v)) {
6398 PyErr_SetString(PyExc_TypeError,
6399 "* wants int");
6400 goto onError;
6401 }
6402 width = PyInt_AsLong(v);
6403 if (width < 0) {
6404 flags |= F_LJUST;
6405 width = -width;
6406 }
6407 if (--fmtcnt >= 0)
6408 c = *fmt++;
6409 }
6410 else if (c >= '0' && c <= '9') {
6411 width = c - '0';
6412 while (--fmtcnt >= 0) {
6413 c = *fmt++;
6414 if (c < '0' || c > '9')
6415 break;
6416 if ((width*10) / 10 != width) {
6417 PyErr_SetString(PyExc_ValueError,
6418 "width too big");
6419 goto onError;
6420 }
6421 width = width*10 + (c - '0');
6422 }
6423 }
6424 if (c == '.') {
6425 prec = 0;
6426 if (--fmtcnt >= 0)
6427 c = *fmt++;
6428 if (c == '*') {
6429 v = getnextarg(args, arglen, &argidx);
6430 if (v == NULL)
6431 goto onError;
6432 if (!PyInt_Check(v)) {
6433 PyErr_SetString(PyExc_TypeError,
6434 "* wants int");
6435 goto onError;
6436 }
6437 prec = PyInt_AsLong(v);
6438 if (prec < 0)
6439 prec = 0;
6440 if (--fmtcnt >= 0)
6441 c = *fmt++;
6442 }
6443 else if (c >= '0' && c <= '9') {
6444 prec = c - '0';
6445 while (--fmtcnt >= 0) {
6446 c = Py_CHARMASK(*fmt++);
6447 if (c < '0' || c > '9')
6448 break;
6449 if ((prec*10) / 10 != prec) {
6450 PyErr_SetString(PyExc_ValueError,
6451 "prec too big");
6452 goto onError;
6453 }
6454 prec = prec*10 + (c - '0');
6455 }
6456 }
6457 } /* prec */
6458 if (fmtcnt >= 0) {
6459 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 if (--fmtcnt >= 0)
6461 c = *fmt++;
6462 }
6463 }
6464 if (fmtcnt < 0) {
6465 PyErr_SetString(PyExc_ValueError,
6466 "incomplete format");
6467 goto onError;
6468 }
6469 if (c != '%') {
6470 v = getnextarg(args, arglen, &argidx);
6471 if (v == NULL)
6472 goto onError;
6473 }
6474 sign = 0;
6475 fill = ' ';
6476 switch (c) {
6477
6478 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006479 pbuf = formatbuf;
6480 /* presume that buffer length is at least 1 */
6481 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 len = 1;
6483 break;
6484
6485 case 's':
6486 case 'r':
6487 if (PyUnicode_Check(v) && c == 's') {
6488 temp = v;
6489 Py_INCREF(temp);
6490 }
6491 else {
6492 PyObject *unicode;
6493 if (c == 's')
6494 temp = PyObject_Str(v);
6495 else
6496 temp = PyObject_Repr(v);
6497 if (temp == NULL)
6498 goto onError;
6499 if (!PyString_Check(temp)) {
6500 /* XXX Note: this should never happen, since
6501 PyObject_Repr() and PyObject_Str() assure
6502 this */
6503 Py_DECREF(temp);
6504 PyErr_SetString(PyExc_TypeError,
6505 "%s argument has non-string str()");
6506 goto onError;
6507 }
Fred Drakee4315f52000-05-09 19:53:39 +00006508 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006510 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 "strict");
6512 Py_DECREF(temp);
6513 temp = unicode;
6514 if (temp == NULL)
6515 goto onError;
6516 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006517 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 len = PyUnicode_GET_SIZE(temp);
6519 if (prec >= 0 && len > prec)
6520 len = prec;
6521 break;
6522
6523 case 'i':
6524 case 'd':
6525 case 'u':
6526 case 'o':
6527 case 'x':
6528 case 'X':
6529 if (c == 'i')
6530 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006531 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006532 temp = formatlong(v, flags, prec, c);
6533 if (!temp)
6534 goto onError;
6535 pbuf = PyUnicode_AS_UNICODE(temp);
6536 len = PyUnicode_GET_SIZE(temp);
6537 /* unbounded ints can always produce
6538 a sign character! */
6539 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006541 else {
6542 pbuf = formatbuf;
6543 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6544 flags, prec, c, v);
6545 if (len < 0)
6546 goto onError;
6547 /* only d conversion is signed */
6548 sign = c == 'd';
6549 }
6550 if (flags & F_ZERO)
6551 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 break;
6553
6554 case 'e':
6555 case 'E':
6556 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006557 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 case 'g':
6559 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006560 if (c == 'F')
6561 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006562 pbuf = formatbuf;
6563 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6564 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 if (len < 0)
6566 goto onError;
6567 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006568 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 fill = '0';
6570 break;
6571
6572 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006573 pbuf = formatbuf;
6574 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 if (len < 0)
6576 goto onError;
6577 break;
6578
6579 default:
6580 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006581 "unsupported format character '%c' (0x%x) "
6582 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006583 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006584 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006585 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 goto onError;
6587 }
6588 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006589 if (*pbuf == '-' || *pbuf == '+') {
6590 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 len--;
6592 }
6593 else if (flags & F_SIGN)
6594 sign = '+';
6595 else if (flags & F_BLANK)
6596 sign = ' ';
6597 else
6598 sign = 0;
6599 }
6600 if (width < len)
6601 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006602 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 reslen -= rescnt;
6604 rescnt = width + fmtcnt + 100;
6605 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006606 if (reslen < 0) {
6607 Py_DECREF(result);
6608 return PyErr_NoMemory();
6609 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006610 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 return NULL;
6612 res = PyUnicode_AS_UNICODE(result)
6613 + reslen - rescnt;
6614 }
6615 if (sign) {
6616 if (fill != ' ')
6617 *res++ = sign;
6618 rescnt--;
6619 if (width > len)
6620 width--;
6621 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006622 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6623 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006624 assert(pbuf[1] == c);
6625 if (fill != ' ') {
6626 *res++ = *pbuf++;
6627 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006628 }
Tim Petersfff53252001-04-12 18:38:48 +00006629 rescnt -= 2;
6630 width -= 2;
6631 if (width < 0)
6632 width = 0;
6633 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006634 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 if (width > len && !(flags & F_LJUST)) {
6636 do {
6637 --rescnt;
6638 *res++ = fill;
6639 } while (--width > len);
6640 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006641 if (fill == ' ') {
6642 if (sign)
6643 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006644 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006645 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006646 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006647 *res++ = *pbuf++;
6648 *res++ = *pbuf++;
6649 }
6650 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006651 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 res += len;
6653 rescnt -= len;
6654 while (--width >= len) {
6655 --rescnt;
6656 *res++ = ' ';
6657 }
6658 if (dict && (argidx < arglen) && c != '%') {
6659 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006660 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 goto onError;
6662 }
6663 Py_XDECREF(temp);
6664 } /* '%' */
6665 } /* until end */
6666 if (argidx < arglen && !dict) {
6667 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006668 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 goto onError;
6670 }
6671
6672 if (args_owned) {
6673 Py_DECREF(args);
6674 }
6675 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00006676 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006677 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 return (PyObject *)result;
6679
6680 onError:
6681 Py_XDECREF(result);
6682 Py_DECREF(uformat);
6683 if (args_owned) {
6684 Py_DECREF(args);
6685 }
6686 return NULL;
6687}
6688
6689static PyBufferProcs unicode_as_buffer = {
6690 (getreadbufferproc) unicode_buffer_getreadbuf,
6691 (getwritebufferproc) unicode_buffer_getwritebuf,
6692 (getsegcountproc) unicode_buffer_getsegcount,
6693 (getcharbufferproc) unicode_buffer_getcharbuf,
6694};
6695
Jeremy Hylton938ace62002-07-17 16:30:39 +00006696static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006697unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6698
Tim Peters6d6c1a32001-08-02 04:15:00 +00006699static PyObject *
6700unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6701{
6702 PyObject *x = NULL;
6703 static char *kwlist[] = {"string", "encoding", "errors", 0};
6704 char *encoding = NULL;
6705 char *errors = NULL;
6706
Guido van Rossume023fe02001-08-30 03:12:59 +00006707 if (type != &PyUnicode_Type)
6708 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006709 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6710 kwlist, &x, &encoding, &errors))
6711 return NULL;
6712 if (x == NULL)
6713 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006714 if (encoding == NULL && errors == NULL)
6715 return PyObject_Unicode(x);
6716 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006717 return PyUnicode_FromEncodedObject(x, encoding, errors);
6718}
6719
Guido van Rossume023fe02001-08-30 03:12:59 +00006720static PyObject *
6721unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6722{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006723 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006724 int n;
6725
6726 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6727 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6728 if (tmp == NULL)
6729 return NULL;
6730 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006731 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006732 if (pnew == NULL) {
6733 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00006734 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00006735 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006736 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6737 if (pnew->str == NULL) {
6738 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006739 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006740 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00006741 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00006742 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006743 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6744 pnew->length = n;
6745 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006746 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006747 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006748}
6749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006750PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00006751"unicode(string [, encoding[, errors]]) -> object\n\
6752\n\
6753Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00006754encoding defaults to the current default string encoding.\n\
6755errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006756
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757PyTypeObject PyUnicode_Type = {
6758 PyObject_HEAD_INIT(&PyType_Type)
6759 0, /* ob_size */
6760 "unicode", /* tp_name */
6761 sizeof(PyUnicodeObject), /* tp_size */
6762 0, /* tp_itemsize */
6763 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00006764 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006766 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 0, /* tp_setattr */
6768 (cmpfunc) unicode_compare, /* tp_compare */
6769 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006770 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006772 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 (hashfunc) unicode_hash, /* tp_hash*/
6774 0, /* tp_call*/
6775 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006776 PyObject_GenericGetAttr, /* tp_getattro */
6777 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006779 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
6780 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006781 unicode_doc, /* tp_doc */
6782 0, /* tp_traverse */
6783 0, /* tp_clear */
6784 0, /* tp_richcompare */
6785 0, /* tp_weaklistoffset */
6786 0, /* tp_iter */
6787 0, /* tp_iternext */
6788 unicode_methods, /* tp_methods */
6789 0, /* tp_members */
6790 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00006791 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006792 0, /* tp_dict */
6793 0, /* tp_descr_get */
6794 0, /* tp_descr_set */
6795 0, /* tp_dictoffset */
6796 0, /* tp_init */
6797 0, /* tp_alloc */
6798 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006799 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800};
6801
6802/* Initialize the Unicode implementation */
6803
Thomas Wouters78890102000-07-22 19:25:51 +00006804void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006806 int i;
6807
Fred Drakee4315f52000-05-09 19:53:39 +00006808 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006809 unicode_freelist = NULL;
6810 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00006812 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006813 for (i = 0; i < 256; i++)
6814 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006815 if (PyType_Ready(&PyUnicode_Type) < 0)
6816 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817}
6818
6819/* Finalize the Unicode implementation */
6820
6821void
Thomas Wouters78890102000-07-22 19:25:51 +00006822_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006824 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006825 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006827 Py_XDECREF(unicode_empty);
6828 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006829
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006830 for (i = 0; i < 256; i++) {
6831 if (unicode_latin1[i]) {
6832 Py_DECREF(unicode_latin1[i]);
6833 unicode_latin1[i] = NULL;
6834 }
6835 }
6836
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006837 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838 PyUnicodeObject *v = u;
6839 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006840 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006841 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006842 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006843 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006845 unicode_freelist = NULL;
6846 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006848
6849/*
6850Local variables:
6851c-basic-offset: 4
6852indent-tabs-mode: nil
6853End:
6854*/