blob: e4426d4fe453b37c1e51fc07d9fb3013cc5dcf2a [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Size limit for the free list "keep alive" optimization.

   Objects parked on the free list keep their allocated character
   buffer as long as their length is below this limit. This reduces
   malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless the build
   configuration defines WORDS_BIGENDIAN. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
301/* Internal API for use in unicodeobject.c only ! */
302#define _PyUnicode_Resize(unicodevar, length) \
303 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
371 for (i = size; i >= 0; i--)
372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
387 if (size > PyUnicode_GET_SIZE(unicode))
388 size = PyUnicode_GET_SIZE(unicode);
389#ifdef HAVE_USABLE_WCHAR_T
390 memcpy(w, unicode->str, size * sizeof(wchar_t));
391#else
392 {
393 register Py_UNICODE *u;
394 register int i;
395 u = PyUnicode_AS_UNICODE(unicode);
396 for (i = size; i >= 0; i--)
397 *w++ = *u++;
398 }
399#endif
400
401 return size;
402}
403
404#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000408 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
Hye-Shik Chang40574832004-04-06 07:24:51 +0000426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000428}
429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromObject(register PyObject *obj)
431{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000432 /* XXX Perhaps we should make this API an alias of
433 PyObject_Unicode() instead ?! */
434 if (PyUnicode_CheckExact(obj)) {
435 Py_INCREF(obj);
436 return obj;
437 }
438 if (PyUnicode_Check(obj)) {
439 /* For a Unicode subtype that's not a Unicode object,
440 return a true Unicode object with the same data. */
441 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
442 PyUnicode_GET_SIZE(obj));
443 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000444 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
445}
446
447PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
448 const char *encoding,
449 const char *errors)
450{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000451 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000453 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 if (obj == NULL) {
456 PyErr_BadInternalCall();
457 return NULL;
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460#if 0
461 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000462 that no encodings is given and then redirect to
463 PyObject_Unicode() which then applies the additional logic for
464 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466 NOTE: This API should really only be used for object which
467 represent *encoded* Unicode !
468
469 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000470 if (PyUnicode_Check(obj)) {
471 if (encoding) {
472 PyErr_SetString(PyExc_TypeError,
473 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000474 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000475 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000476 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000477 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000478#else
479 if (PyUnicode_Check(obj)) {
480 PyErr_SetString(PyExc_TypeError,
481 "decoding Unicode is not supported");
482 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#endif
485
486 /* Coerce object */
487 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000488 s = PyString_AS_STRING(obj);
489 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
492 /* Overwrite the error message with something more useful in
493 case of a TypeError. */
494 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000495 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000496 "coercing to Unicode: need string or buffer, "
497 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 obj->ob_type->tp_name);
499 goto onError;
500 }
Tim Petersced69f82003-09-16 20:30:58 +0000501
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000502 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 if (len == 0) {
504 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000509
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 return v;
511
512 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514}
515
516PyObject *PyUnicode_Decode(const char *s,
517 int size,
518 const char *encoding,
519 const char *errors)
520{
521 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000522
523 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000524 encoding = PyUnicode_GetDefaultEncoding();
525
526 /* Shortcuts for common default encodings */
527 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000529 else if (strcmp(encoding, "latin-1") == 0)
530 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
532 else if (strcmp(encoding, "mbcs") == 0)
533 return PyUnicode_DecodeMBCS(s, size, errors);
534#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "ascii") == 0)
536 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537
538 /* Decode via the codec registry */
539 buffer = PyBuffer_FromMemory((void *)s, size);
540 if (buffer == NULL)
541 goto onError;
542 unicode = PyCodec_Decode(buffer, encoding, errors);
543 if (unicode == NULL)
544 goto onError;
545 if (!PyUnicode_Check(unicode)) {
546 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000547 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548 unicode->ob_type->tp_name);
549 Py_DECREF(unicode);
550 goto onError;
551 }
552 Py_DECREF(buffer);
553 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000554
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555 onError:
556 Py_XDECREF(buffer);
557 return NULL;
558}
559
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000560PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
561 const char *encoding,
562 const char *errors)
563{
564 PyObject *v;
565
566 if (!PyUnicode_Check(unicode)) {
567 PyErr_BadArgument();
568 goto onError;
569 }
570
571 if (encoding == NULL)
572 encoding = PyUnicode_GetDefaultEncoding();
573
574 /* Decode via the codec registry */
575 v = PyCodec_Decode(unicode, encoding, errors);
576 if (v == NULL)
577 goto onError;
578 return v;
579
580 onError:
581 return NULL;
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584PyObject *PyUnicode_Encode(const Py_UNICODE *s,
585 int size,
586 const char *encoding,
587 const char *errors)
588{
589 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000590
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 unicode = PyUnicode_FromUnicode(s, size);
592 if (unicode == NULL)
593 return NULL;
594 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
595 Py_DECREF(unicode);
596 return v;
597}
598
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000599PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
600 const char *encoding,
601 const char *errors)
602{
603 PyObject *v;
604
605 if (!PyUnicode_Check(unicode)) {
606 PyErr_BadArgument();
607 goto onError;
608 }
609
610 if (encoding == NULL)
611 encoding = PyUnicode_GetDefaultEncoding();
612
613 /* Encode via the codec registry */
614 v = PyCodec_Encode(unicode, encoding, errors);
615 if (v == NULL)
616 goto onError;
617 return v;
618
619 onError:
620 return NULL;
621}
622
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
624 const char *encoding,
625 const char *errors)
626{
627 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 if (!PyUnicode_Check(unicode)) {
630 PyErr_BadArgument();
631 goto onError;
632 }
Fred Drakee4315f52000-05-09 19:53:39 +0000633
Tim Petersced69f82003-09-16 20:30:58 +0000634 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000635 encoding = PyUnicode_GetDefaultEncoding();
636
637 /* Shortcuts for common default encodings */
638 if (errors == NULL) {
639 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000640 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000641 else if (strcmp(encoding, "latin-1") == 0)
642 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000643#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
644 else if (strcmp(encoding, "mbcs") == 0)
645 return PyUnicode_AsMBCSString(unicode);
646#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000647 else if (strcmp(encoding, "ascii") == 0)
648 return PyUnicode_AsASCIIString(unicode);
649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650
651 /* Encode via the codec registry */
652 v = PyCodec_Encode(unicode, encoding, errors);
653 if (v == NULL)
654 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 if (!PyString_Check(v)) {
656 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000657 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 v->ob_type->tp_name);
659 Py_DECREF(v);
660 goto onError;
661 }
662 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 onError:
665 return NULL;
666}
667
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000668PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
669 const char *errors)
670{
671 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
672
673 if (v)
674 return v;
675 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
676 if (v && errors == NULL)
677 ((PyUnicodeObject *)unicode)->defenc = v;
678 return v;
679}
680
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
682{
683 if (!PyUnicode_Check(unicode)) {
684 PyErr_BadArgument();
685 goto onError;
686 }
687 return PyUnicode_AS_UNICODE(unicode);
688
689 onError:
690 return NULL;
691}
692
693int PyUnicode_GetSize(PyObject *unicode)
694{
695 if (!PyUnicode_Check(unicode)) {
696 PyErr_BadArgument();
697 goto onError;
698 }
699 return PyUnicode_GET_SIZE(unicode);
700
701 onError:
702 return -1;
703}
704
Thomas Wouters78890102000-07-22 19:25:51 +0000705const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000706{
707 return unicode_default_encoding;
708}
709
710int PyUnicode_SetDefaultEncoding(const char *encoding)
711{
712 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000713
Fred Drakee4315f52000-05-09 19:53:39 +0000714 /* Make sure the encoding is valid. As side effect, this also
715 loads the encoding into the codec registry cache. */
716 v = _PyCodec_Lookup(encoding);
717 if (v == NULL)
718 goto onError;
719 Py_DECREF(v);
720 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000721 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000722 sizeof(unicode_default_encoding));
723 return 0;
724
725 onError:
726 return -1;
727}
728
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error
*/
735
736static
737int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
738 const char *encoding, const char *reason,
739 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
740 PyObject **output, int *outpos, Py_UNICODE **outptr)
741{
742 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
743
744 PyObject *restuple = NULL;
745 PyObject *repunicode = NULL;
746 int outsize = PyUnicode_GET_SIZE(*output);
747 int requiredsize;
748 int newpos;
749 Py_UNICODE *repptr;
750 int repsize;
751 int res = -1;
752
753 if (*errorHandler == NULL) {
754 *errorHandler = PyCodec_LookupError(errors);
755 if (*errorHandler == NULL)
756 goto onError;
757 }
758
759 if (*exceptionObject == NULL) {
760 *exceptionObject = PyUnicodeDecodeError_Create(
761 encoding, input, insize, *startinpos, *endinpos, reason);
762 if (*exceptionObject == NULL)
763 goto onError;
764 }
765 else {
766 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
767 goto onError;
768 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
769 goto onError;
770 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
771 goto onError;
772 }
773
774 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
775 if (restuple == NULL)
776 goto onError;
777 if (!PyTuple_Check(restuple)) {
778 PyErr_Format(PyExc_TypeError, &argparse[4]);
779 goto onError;
780 }
781 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
782 goto onError;
783 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000784 newpos = insize+newpos;
785 if (newpos<0 || newpos>insize) {
786 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
787 goto onError;
788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000789
790 /* need more space? (at least enough for what we
791 have+the replacement+the rest of the string (starting
792 at the new input position), so we won't have to check space
793 when there are no errors in the rest of the string) */
794 repptr = PyUnicode_AS_UNICODE(repunicode);
795 repsize = PyUnicode_GET_SIZE(repunicode);
796 requiredsize = *outpos + repsize + insize-newpos;
797 if (requiredsize > outsize) {
798 if (requiredsize<2*outsize)
799 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000800 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000801 goto onError;
802 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
803 }
804 *endinpos = newpos;
805 *inptr = input + newpos;
806 Py_UNICODE_COPY(*outptr, repptr, repsize);
807 *outptr += repsize;
808 *outpos += repsize;
809 /* we made it! */
810 res = 0;
811
812 onError:
813 Py_XDECREF(restuple);
814 return res;
815}
816
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000817/* --- UTF-7 Codec -------------------------------------------------------- */
818
819/* see RFC2152 for details */
820
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   Each entry says whether the character may be written directly or has
   to go through a base64 shift sequence:
       0 - not special (can always be written directly)
       1 - special (must always be encoded)
       2 - whitespace (encoding is optional)
       3 - RFC2152 Set O (encoding is optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f: control characters; TAB, LF and CR count as whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space and punctuation ('+' and '/' are special) */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: digits, ':' through '?' */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: '@', 'A' - 'O' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: 'P' - 'Z', '[' through '_' (backslash is special) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: '`', 'a' - 'o' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: 'p' - 'z', '{' through DEL ('~' and DEL are special) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1
};
839
/* SPECIAL(c, encodeO, encodeWS): non-zero when character c has to be
   written inside a base64 shift sequence.  Characters above 127 and
   those classified 1 in utf7_special always are; whitespace (class 2)
   and Set O (class 3) only when the respective flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): non-zero when c is one of the 64 base64 digits.
   Written with explicit ASCII ranges rather than isalnum(): isalnum()
   is locale dependent and its behaviour is undefined for arguments that
   do not fit in an unsigned char, which Py_UNICODE values may not. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): numeric value (0..63) of base64 digit c.  Only meaningful
   when B64CHAR(c) is true. */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   emit the top six as a base64 digit through out and decrease bits.
   Modifies out and bits. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, take the top sixteen as one code unit.  Expects the
   expansion site to provide an `errmsg` variable and a `utf7Error`
   label.  Modifies out, bits and surrogate. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct \
               or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }

Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000875PyObject *PyUnicode_DecodeUTF7(const char *s,
876 int size,
877 const char *errors)
878{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000879 const char *starts = s;
880 int startinpos;
881 int endinpos;
882 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000883 const char *e;
884 PyUnicodeObject *unicode;
885 Py_UNICODE *p;
886 const char *errmsg = "";
887 int inShift = 0;
888 unsigned int bitsleft = 0;
889 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000890 int surrogate = 0;
891 PyObject *errorHandler = NULL;
892 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000893
894 unicode = _PyUnicode_New(size);
895 if (!unicode)
896 return NULL;
897 if (size == 0)
898 return (PyObject *)unicode;
899
900 p = unicode->str;
901 e = s + size;
902
903 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 Py_UNICODE ch;
905 restart:
906 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000907
908 if (inShift) {
909 if ((ch == '-') || !B64CHAR(ch)) {
910 inShift = 0;
911 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000912
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000913 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
914 if (bitsleft >= 6) {
915 /* The shift sequence has a partial character in it. If
916 bitsleft < 6 then we could just classify it as padding
917 but that is not the case here */
918
919 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000920 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000921 }
922 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000923 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000924 here so indicate the potential of a misencoded character. */
925
926 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
927 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
928 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000929 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000930 }
931
932 if (ch == '-') {
933 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000934 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000935 inShift = 1;
936 }
937 } else if (SPECIAL(ch,0,0)) {
938 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000939 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000940 } else {
941 *p++ = ch;
942 }
943 } else {
944 charsleft = (charsleft << 6) | UB64(ch);
945 bitsleft += 6;
946 s++;
947 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
948 }
949 }
950 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000952 s++;
953 if (s < e && *s == '-') {
954 s++;
955 *p++ = '+';
956 } else
957 {
958 inShift = 1;
959 bitsleft = 0;
960 }
961 }
962 else if (SPECIAL(ch,0,0)) {
963 errmsg = "unexpected special character";
964 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000965 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000966 }
967 else {
968 *p++ = ch;
969 s++;
970 }
971 continue;
972 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000973 outpos = p-PyUnicode_AS_UNICODE(unicode);
974 endinpos = s-starts;
975 if (unicode_decode_call_errorhandler(
976 errors, &errorHandler,
977 "utf7", errmsg,
978 starts, size, &startinpos, &endinpos, &exc, &s,
979 (PyObject **)&unicode, &outpos, &p))
980 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000981 }
982
983 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000984 outpos = p-PyUnicode_AS_UNICODE(unicode);
985 endinpos = size;
986 if (unicode_decode_call_errorhandler(
987 errors, &errorHandler,
988 "utf7", "unterminated shift sequence",
989 starts, size, &startinpos, &endinpos, &exc, &s,
990 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000992 if (s < e)
993 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000994 }
995
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000996 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 goto onError;
998
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000999 Py_XDECREF(errorHandler);
1000 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001001 return (PyObject *)unicode;
1002
1003onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001004 Py_XDECREF(errorHandler);
1005 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006 Py_DECREF(unicode);
1007 return NULL;
1008}
1009
1010
1011PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1012 int size,
1013 int encodeSetO,
1014 int encodeWhiteSpace,
1015 const char *errors)
1016{
1017 PyObject *v;
1018 /* It might be possible to tighten this worst case */
1019 unsigned int cbAllocated = 5 * size;
1020 int inShift = 0;
1021 int i = 0;
1022 unsigned int bitsleft = 0;
1023 unsigned long charsleft = 0;
1024 char * out;
1025 char * start;
1026
1027 if (size == 0)
1028 return PyString_FromStringAndSize(NULL, 0);
1029
1030 v = PyString_FromStringAndSize(NULL, cbAllocated);
1031 if (v == NULL)
1032 return NULL;
1033
1034 start = out = PyString_AS_STRING(v);
1035 for (;i < size; ++i) {
1036 Py_UNICODE ch = s[i];
1037
1038 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001039 if (ch == '+') {
1040 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001041 *out++ = '-';
1042 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1043 charsleft = ch;
1044 bitsleft = 16;
1045 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001046 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001047 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001048 } else {
1049 *out++ = (char) ch;
1050 }
1051 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1053 *out++ = B64(charsleft << (6-bitsleft));
1054 charsleft = 0;
1055 bitsleft = 0;
1056 /* Characters not in the BASE64 set implicitly unshift the sequence
1057 so no '-' is required, except if the character is itself a '-' */
1058 if (B64CHAR(ch) || ch == '-') {
1059 *out++ = '-';
1060 }
1061 inShift = 0;
1062 *out++ = (char) ch;
1063 } else {
1064 bitsleft += 16;
1065 charsleft = (charsleft << 16) | ch;
1066 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1067
1068 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001069 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001070 or '-' then the shift sequence will be terminated implicitly and we
1071 don't have to insert a '-'. */
1072
1073 if (bitsleft == 0) {
1074 if (i + 1 < size) {
1075 Py_UNICODE ch2 = s[i+1];
1076
1077 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001078
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001079 } else if (B64CHAR(ch2) || ch2 == '-') {
1080 *out++ = '-';
1081 inShift = 0;
1082 } else {
1083 inShift = 0;
1084 }
1085
1086 }
1087 else {
1088 *out++ = '-';
1089 inShift = 0;
1090 }
1091 }
Tim Petersced69f82003-09-16 20:30:58 +00001092 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001094 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001095 if (bitsleft) {
1096 *out++= B64(charsleft << (6-bitsleft) );
1097 *out++ = '-';
1098 }
1099
Tim Peters5de98422002-04-27 18:44:32 +00001100 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001101 return v;
1102}
1103
/* The UTF-7 helper macros are private to the two functions above. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111/* --- UTF-8 Codec -------------------------------------------------------- */
1112
/* Length in bytes of the UTF-8 sequence introduced by each possible
   first byte.  Zero marks a byte that cannot start a sequence.
   See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six byte sequences; 0xfe/0xff illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1134
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135PyObject *PyUnicode_DecodeUTF8(const char *s,
1136 int size,
1137 const char *errors)
1138{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001139 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001141 int startinpos;
1142 int endinpos;
1143 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 const char *e;
1145 PyUnicodeObject *unicode;
1146 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001147 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001148 PyObject *errorHandler = NULL;
1149 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150
1151 /* Note: size will always be longer than the resulting Unicode
1152 character count */
1153 unicode = _PyUnicode_New(size);
1154 if (!unicode)
1155 return NULL;
1156 if (size == 0)
1157 return (PyObject *)unicode;
1158
1159 /* Unpack UTF-8 encoded data */
1160 p = unicode->str;
1161 e = s + size;
1162
1163 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001164 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165
1166 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001167 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 s++;
1169 continue;
1170 }
1171
1172 n = utf8_code_length[ch];
1173
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001174 if (s + n > e) {
1175 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001176 startinpos = s-starts;
1177 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 goto utf8Error;
1179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180
1181 switch (n) {
1182
1183 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001184 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001185 startinpos = s-starts;
1186 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001187 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188
1189 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001190 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001191 startinpos = s-starts;
1192 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001193 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194
1195 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001196 if ((s[1] & 0xc0) != 0x80) {
1197 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001198 startinpos = s-starts;
1199 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001200 goto utf8Error;
1201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001203 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001204 startinpos = s-starts;
1205 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001206 errmsg = "illegal encoding";
1207 goto utf8Error;
1208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001210 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 break;
1212
1213 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001214 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001215 (s[2] & 0xc0) != 0x80) {
1216 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001217 startinpos = s-starts;
1218 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001219 goto utf8Error;
1220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001222 if (ch < 0x0800) {
1223 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001224 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001225
1226 XXX For wide builds (UCS-4) we should probably try
1227 to recombine the surrogates into a single code
1228 unit.
1229 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001230 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001231 startinpos = s-starts;
1232 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001233 goto utf8Error;
1234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001236 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001237 break;
1238
1239 case 4:
1240 if ((s[1] & 0xc0) != 0x80 ||
1241 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001242 (s[3] & 0xc0) != 0x80) {
1243 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001244 startinpos = s-starts;
1245 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001246 goto utf8Error;
1247 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001248 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1249 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1250 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001251 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001252 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001253 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001254 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001255 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001256 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001257 startinpos = s-starts;
1258 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001259 goto utf8Error;
1260 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001261#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001262 *p++ = (Py_UNICODE)ch;
1263#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001264 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001265
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001266 /* translate from 10000..10FFFF to 0..FFFF */
1267 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001268
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001269 /* high surrogate = top 10 bits added to D800 */
1270 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001271
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001272 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001273 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001274#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275 break;
1276
1277 default:
1278 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001279 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 }
1284 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001286
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001288 outpos = p-PyUnicode_AS_UNICODE(unicode);
1289 if (unicode_decode_call_errorhandler(
1290 errors, &errorHandler,
1291 "utf8", errmsg,
1292 starts, size, &startinpos, &endinpos, &exc, &s,
1293 (PyObject **)&unicode, &outpos, &p))
1294 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295 }
1296
1297 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001298 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299 goto onError;
1300
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001301 Py_XDECREF(errorHandler);
1302 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 return (PyObject *)unicode;
1304
1305onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001306 Py_XDECREF(errorHandler);
1307 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 Py_DECREF(unicode);
1309 return NULL;
1310}
1311
Tim Peters602f7402002-04-27 18:03:26 +00001312/* Allocation strategy: if the string is short, convert into a stack buffer
1313 and allocate exactly as much space needed at the end. Else allocate the
1314 maximum possible needed (4 result bytes per Unicode character), and return
1315 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001316*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001317PyObject *
1318PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1319 int size,
1320 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321{
Tim Peters602f7402002-04-27 18:03:26 +00001322#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001323
Tim Peters602f7402002-04-27 18:03:26 +00001324 int i; /* index into s of next input byte */
1325 PyObject *v; /* result string object */
1326 char *p; /* next free byte in output buffer */
1327 int nallocated; /* number of result bytes allocated */
1328 int nneeded; /* number of result bytes needed */
1329 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001330
Tim Peters602f7402002-04-27 18:03:26 +00001331 assert(s != NULL);
1332 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333
Tim Peters602f7402002-04-27 18:03:26 +00001334 if (size <= MAX_SHORT_UNICHARS) {
1335 /* Write into the stack buffer; nallocated can't overflow.
1336 * At the end, we'll allocate exactly as much heap space as it
1337 * turns out we need.
1338 */
1339 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1340 v = NULL; /* will allocate after we're done */
1341 p = stackbuf;
1342 }
1343 else {
1344 /* Overallocate on the heap, and give the excess back at the end. */
1345 nallocated = size * 4;
1346 if (nallocated / 4 != size) /* overflow! */
1347 return PyErr_NoMemory();
1348 v = PyString_FromStringAndSize(NULL, nallocated);
1349 if (v == NULL)
1350 return NULL;
1351 p = PyString_AS_STRING(v);
1352 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001353
Tim Peters602f7402002-04-27 18:03:26 +00001354 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001355 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001356
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001357 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001358 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001360
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001362 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001363 *p++ = (char)(0xc0 | (ch >> 6));
1364 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001365 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001366 else {
Tim Peters602f7402002-04-27 18:03:26 +00001367 /* Encode UCS2 Unicode ordinals */
1368 if (ch < 0x10000) {
1369 /* Special case: check for high surrogate */
1370 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1371 Py_UCS4 ch2 = s[i];
1372 /* Check for low surrogate and combine the two to
1373 form a UCS4 value */
1374 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001375 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001376 i++;
1377 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001378 }
Tim Peters602f7402002-04-27 18:03:26 +00001379 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001380 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001381 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001382 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1383 *p++ = (char)(0x80 | (ch & 0x3f));
1384 continue;
1385 }
1386encodeUCS4:
1387 /* Encode UCS4 Unicode ordinals */
1388 *p++ = (char)(0xf0 | (ch >> 18));
1389 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1390 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1391 *p++ = (char)(0x80 | (ch & 0x3f));
1392 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001394
Tim Peters602f7402002-04-27 18:03:26 +00001395 if (v == NULL) {
1396 /* This was stack allocated. */
1397 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1398 assert(nneeded <= nallocated);
1399 v = PyString_FromStringAndSize(stackbuf, nneeded);
1400 }
1401 else {
1402 /* Cut back to size actually needed. */
1403 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1404 assert(nneeded <= nallocated);
1405 _PyString_Resize(&v, nneeded);
1406 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001407 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001408
Tim Peters602f7402002-04-27 18:03:26 +00001409#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410}
1411
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1413{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414 if (!PyUnicode_Check(unicode)) {
1415 PyErr_BadArgument();
1416 return NULL;
1417 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001418 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1419 PyUnicode_GET_SIZE(unicode),
1420 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421}
1422
1423/* --- UTF-16 Codec ------------------------------------------------------- */
1424
Tim Peters772747b2001-08-09 22:21:55 +00001425PyObject *
1426PyUnicode_DecodeUTF16(const char *s,
1427 int size,
1428 const char *errors,
1429 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001431 const char *starts = s;
1432 int startinpos;
1433 int endinpos;
1434 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 PyUnicodeObject *unicode;
1436 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001437 const unsigned char *q, *e;
1438 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001439 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001440 /* Offsets from q for retrieving byte pairs in the right order. */
1441#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1442 int ihi = 1, ilo = 0;
1443#else
1444 int ihi = 0, ilo = 1;
1445#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 PyObject *errorHandler = NULL;
1447 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448
1449 /* Note: size will always be longer than the resulting Unicode
1450 character count */
1451 unicode = _PyUnicode_New(size);
1452 if (!unicode)
1453 return NULL;
1454 if (size == 0)
1455 return (PyObject *)unicode;
1456
1457 /* Unpack UTF-16 encoded data */
1458 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001459 q = (unsigned char *)s;
1460 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461
1462 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001463 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001465 /* Check for BOM marks (U+FEFF) in the input and adjust current
1466 byte order setting accordingly. In native mode, the leading BOM
1467 mark is skipped, in all other modes, it is copied to the output
1468 stream as-is (giving a ZWNBSP character). */
1469 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001470 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001471#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001472 if (bom == 0xFEFF) {
1473 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001474 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001475 }
1476 else if (bom == 0xFFFE) {
1477 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001478 bo = 1;
1479 }
Tim Petersced69f82003-09-16 20:30:58 +00001480#else
Tim Peters772747b2001-08-09 22:21:55 +00001481 if (bom == 0xFEFF) {
1482 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001483 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001484 }
1485 else if (bom == 0xFFFE) {
1486 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001487 bo = -1;
1488 }
1489#endif
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491
Tim Peters772747b2001-08-09 22:21:55 +00001492 if (bo == -1) {
1493 /* force LE */
1494 ihi = 1;
1495 ilo = 0;
1496 }
1497 else if (bo == 1) {
1498 /* force BE */
1499 ihi = 0;
1500 ilo = 1;
1501 }
1502
1503 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 Py_UNICODE ch;
1505 /* remaing bytes at the end? (size should be even) */
1506 if (e-q<2) {
1507 errmsg = "truncated data";
1508 startinpos = ((const char *)q)-starts;
1509 endinpos = ((const char *)e)-starts;
1510 goto utf16Error;
1511 /* The remaining input chars are ignored if the callback
1512 chooses to skip the input */
1513 }
1514 ch = (q[ihi] << 8) | q[ilo];
1515
Tim Peters772747b2001-08-09 22:21:55 +00001516 q += 2;
1517
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518 if (ch < 0xD800 || ch > 0xDFFF) {
1519 *p++ = ch;
1520 continue;
1521 }
1522
1523 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001524 if (q >= e) {
1525 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001526 startinpos = (((const char *)q)-2)-starts;
1527 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001528 goto utf16Error;
1529 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001530 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001531 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1532 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001533 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001534#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001535 *p++ = ch;
1536 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001537#else
1538 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001539#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001540 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001541 }
1542 else {
1543 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001544 startinpos = (((const char *)q)-4)-starts;
1545 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001546 goto utf16Error;
1547 }
1548
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001550 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001551 startinpos = (((const char *)q)-2)-starts;
1552 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001553 /* Fall through to report the error */
1554
1555 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001556 outpos = p-PyUnicode_AS_UNICODE(unicode);
1557 if (unicode_decode_call_errorhandler(
1558 errors, &errorHandler,
1559 "utf16", errmsg,
1560 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1561 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001562 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 }
1564
1565 if (byteorder)
1566 *byteorder = bo;
1567
1568 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001569 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570 goto onError;
1571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001572 Py_XDECREF(errorHandler);
1573 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001574 return (PyObject *)unicode;
1575
1576onError:
1577 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001578 Py_XDECREF(errorHandler);
1579 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580 return NULL;
1581}
1582
Tim Peters772747b2001-08-09 22:21:55 +00001583PyObject *
1584PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1585 int size,
1586 const char *errors,
1587 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588{
1589 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001590 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001591#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001592 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001593#else
1594 const int pairs = 0;
1595#endif
Tim Peters772747b2001-08-09 22:21:55 +00001596 /* Offsets from p for storing byte pairs in the right order. */
1597#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1598 int ihi = 1, ilo = 0;
1599#else
1600 int ihi = 0, ilo = 1;
1601#endif
1602
1603#define STORECHAR(CH) \
1604 do { \
1605 p[ihi] = ((CH) >> 8) & 0xff; \
1606 p[ilo] = (CH) & 0xff; \
1607 p += 2; \
1608 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001610#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001611 for (i = pairs = 0; i < size; i++)
1612 if (s[i] >= 0x10000)
1613 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001614#endif
Tim Petersced69f82003-09-16 20:30:58 +00001615 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001616 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617 if (v == NULL)
1618 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619
Tim Peters772747b2001-08-09 22:21:55 +00001620 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001622 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001623 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001624 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001625
1626 if (byteorder == -1) {
1627 /* force LE */
1628 ihi = 1;
1629 ilo = 0;
1630 }
1631 else if (byteorder == 1) {
1632 /* force BE */
1633 ihi = 0;
1634 ilo = 1;
1635 }
1636
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637 while (size-- > 0) {
1638 Py_UNICODE ch = *s++;
1639 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001640#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001641 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001642 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1643 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001645#endif
Tim Peters772747b2001-08-09 22:21:55 +00001646 STORECHAR(ch);
1647 if (ch2)
1648 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001650 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001651#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652}
1653
1654PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1655{
1656 if (!PyUnicode_Check(unicode)) {
1657 PyErr_BadArgument();
1658 return NULL;
1659 }
1660 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1661 PyUnicode_GET_SIZE(unicode),
1662 NULL,
1663 0);
1664}
1665
1666/* --- Unicode Escape Codec ----------------------------------------------- */
1667
Fredrik Lundh06d12682001-01-24 07:59:11 +00001668static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001669
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1671 int size,
1672 const char *errors)
1673{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001674 const char *starts = s;
1675 int startinpos;
1676 int endinpos;
1677 int outpos;
1678 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001680 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001682 char* message;
1683 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 PyObject *errorHandler = NULL;
1685 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001686
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 /* Escaped strings will always be longer than the resulting
1688 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001689 length after conversion to the true value.
1690 (but if the error callback returns a long replacement string
1691 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 v = _PyUnicode_New(size);
1693 if (v == NULL)
1694 goto onError;
1695 if (size == 0)
1696 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001698 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001700
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701 while (s < end) {
1702 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001703 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001704 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705
1706 /* Non-escape characters are interpreted as Unicode ordinals */
1707 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001708 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001709 continue;
1710 }
1711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001712 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 /* \ - Escapes */
1714 s++;
1715 switch (*s++) {
1716
1717 /* \x escapes */
1718 case '\n': break;
1719 case '\\': *p++ = '\\'; break;
1720 case '\'': *p++ = '\''; break;
1721 case '\"': *p++ = '\"'; break;
1722 case 'b': *p++ = '\b'; break;
1723 case 'f': *p++ = '\014'; break; /* FF */
1724 case 't': *p++ = '\t'; break;
1725 case 'n': *p++ = '\n'; break;
1726 case 'r': *p++ = '\r'; break;
1727 case 'v': *p++ = '\013'; break; /* VT */
1728 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1729
1730 /* \OOO (octal) escapes */
1731 case '0': case '1': case '2': case '3':
1732 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001733 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001735 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001737 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001739 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740 break;
1741
Fredrik Lundhccc74732001-02-18 22:13:49 +00001742 /* hex escapes */
1743 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001744 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001745 digits = 2;
1746 message = "truncated \\xXX escape";
1747 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748
Fredrik Lundhccc74732001-02-18 22:13:49 +00001749 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001751 digits = 4;
1752 message = "truncated \\uXXXX escape";
1753 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
Fredrik Lundhccc74732001-02-18 22:13:49 +00001755 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001756 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001757 digits = 8;
1758 message = "truncated \\UXXXXXXXX escape";
1759 hexescape:
1760 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 outpos = p-PyUnicode_AS_UNICODE(v);
1762 if (s+digits>end) {
1763 endinpos = size;
1764 if (unicode_decode_call_errorhandler(
1765 errors, &errorHandler,
1766 "unicodeescape", "end of string in escape sequence",
1767 starts, size, &startinpos, &endinpos, &exc, &s,
1768 (PyObject **)&v, &outpos, &p))
1769 goto onError;
1770 goto nextByte;
1771 }
1772 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001773 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001774 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 endinpos = (s+i+1)-starts;
1776 if (unicode_decode_call_errorhandler(
1777 errors, &errorHandler,
1778 "unicodeescape", message,
1779 starts, size, &startinpos, &endinpos, &exc, &s,
1780 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001781 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001782 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001783 }
1784 chr = (chr<<4) & ~0xF;
1785 if (c >= '0' && c <= '9')
1786 chr += c - '0';
1787 else if (c >= 'a' && c <= 'f')
1788 chr += 10 + c - 'a';
1789 else
1790 chr += 10 + c - 'A';
1791 }
1792 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001793 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001794 /* _decoding_error will have already written into the
1795 target buffer. */
1796 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001797 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001798 /* when we get here, chr is a 32-bit unicode character */
1799 if (chr <= 0xffff)
1800 /* UCS-2 character */
1801 *p++ = (Py_UNICODE) chr;
1802 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001803 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001804 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001805#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001806 *p++ = chr;
1807#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001808 chr -= 0x10000L;
1809 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001810 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001811#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001812 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 endinpos = s-starts;
1814 outpos = p-PyUnicode_AS_UNICODE(v);
1815 if (unicode_decode_call_errorhandler(
1816 errors, &errorHandler,
1817 "unicodeescape", "illegal Unicode character",
1818 starts, size, &startinpos, &endinpos, &exc, &s,
1819 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001820 goto onError;
1821 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 break;
1823
1824 /* \N{name} */
1825 case 'N':
1826 message = "malformed \\N character escape";
1827 if (ucnhash_CAPI == NULL) {
1828 /* load the unicode data module */
1829 PyObject *m, *v;
1830 m = PyImport_ImportModule("unicodedata");
1831 if (m == NULL)
1832 goto ucnhashError;
1833 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1834 Py_DECREF(m);
1835 if (v == NULL)
1836 goto ucnhashError;
1837 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1838 Py_DECREF(v);
1839 if (ucnhash_CAPI == NULL)
1840 goto ucnhashError;
1841 }
1842 if (*s == '{') {
1843 const char *start = s+1;
1844 /* look for the closing brace */
1845 while (*s != '}' && s < end)
1846 s++;
1847 if (s > start && s < end && *s == '}') {
1848 /* found a name. look it up in the unicode database */
1849 message = "unknown Unicode character name";
1850 s++;
1851 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1852 goto store;
1853 }
1854 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001855 endinpos = s-starts;
1856 outpos = p-PyUnicode_AS_UNICODE(v);
1857 if (unicode_decode_call_errorhandler(
1858 errors, &errorHandler,
1859 "unicodeescape", message,
1860 starts, size, &startinpos, &endinpos, &exc, &s,
1861 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 break;
1864
1865 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001866 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 message = "\\ at end of string";
1868 s--;
1869 endinpos = s-starts;
1870 outpos = p-PyUnicode_AS_UNICODE(v);
1871 if (unicode_decode_call_errorhandler(
1872 errors, &errorHandler,
1873 "unicodeescape", message,
1874 starts, size, &startinpos, &endinpos, &exc, &s,
1875 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001876 goto onError;
1877 }
1878 else {
1879 *p++ = '\\';
1880 *p++ = (unsigned char)s[-1];
1881 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001882 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001884 nextByte:
1885 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001887 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001889 Py_XDECREF(errorHandler);
1890 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001891 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001892
Fredrik Lundhccc74732001-02-18 22:13:49 +00001893ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001894 PyErr_SetString(
1895 PyExc_UnicodeError,
1896 "\\N escapes not supported (can't load unicodedata module)"
1897 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 Py_XDECREF(errorHandler);
1899 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001900 return NULL;
1901
Fredrik Lundhccc74732001-02-18 22:13:49 +00001902onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001904 Py_XDECREF(errorHandler);
1905 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906 return NULL;
1907}
1908
1909/* Return a Unicode-Escape string version of the Unicode object.
1910
1911 If quotes is true, the string is enclosed in u"" or u'' quotes as
1912 appropriate.
1913
1914*/
1915
Barry Warsaw51ac5802000-03-20 16:36:48 +00001916static const Py_UNICODE *findchar(const Py_UNICODE *s,
1917 int size,
1918 Py_UNICODE ch);
1919
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920static
1921PyObject *unicodeescape_string(const Py_UNICODE *s,
1922 int size,
1923 int quotes)
1924{
1925 PyObject *repr;
1926 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001928 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929
1930 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1931 if (repr == NULL)
1932 return NULL;
1933
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001934 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935
1936 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001938 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 !findchar(s, size, '"')) ? '"' : '\'';
1940 }
1941 while (size-- > 0) {
1942 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001943
Guido van Rossumd57fd912000-03-10 22:53:23 +00001944 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001945 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001946 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 *p++ = '\\';
1948 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001949 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001950 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001951
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001952#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001953 /* Map 21-bit characters to '\U00xxxxxx' */
1954 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001955 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001956
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001957 /* Resize the string if necessary */
1958 if (offset + 12 > PyString_GET_SIZE(repr)) {
1959 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001960 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001961 p = PyString_AS_STRING(repr) + offset;
1962 }
1963
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001964 *p++ = '\\';
1965 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001966 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1967 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1968 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1969 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1970 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1971 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1972 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001973 *p++ = hexdigit[ch & 0x0000000F];
1974 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001975 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001976#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001977 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1978 else if (ch >= 0xD800 && ch < 0xDC00) {
1979 Py_UNICODE ch2;
1980 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00001981
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001982 ch2 = *s++;
1983 size--;
1984 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1985 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1986 *p++ = '\\';
1987 *p++ = 'U';
1988 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1989 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1990 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1991 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1992 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1993 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1994 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1995 *p++ = hexdigit[ucs & 0x0000000F];
1996 continue;
1997 }
1998 /* Fall through: isolated surrogates are copied as-is */
1999 s--;
2000 size++;
2001 }
2002
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002004 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 *p++ = '\\';
2006 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002007 *p++ = hexdigit[(ch >> 12) & 0x000F];
2008 *p++ = hexdigit[(ch >> 8) & 0x000F];
2009 *p++ = hexdigit[(ch >> 4) & 0x000F];
2010 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002012
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002013 /* Map special whitespace to '\t', \n', '\r' */
2014 else if (ch == '\t') {
2015 *p++ = '\\';
2016 *p++ = 't';
2017 }
2018 else if (ch == '\n') {
2019 *p++ = '\\';
2020 *p++ = 'n';
2021 }
2022 else if (ch == '\r') {
2023 *p++ = '\\';
2024 *p++ = 'r';
2025 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002026
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002027 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002028 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002030 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002031 *p++ = hexdigit[(ch >> 4) & 0x000F];
2032 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002033 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002034
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 /* Copy everything else as-is */
2036 else
2037 *p++ = (char) ch;
2038 }
2039 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002040 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041
2042 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002043 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 return repr;
2045}
2046
2047PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2048 int size)
2049{
2050 return unicodeescape_string(s, size, 0);
2051}
2052
2053PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2054{
2055 if (!PyUnicode_Check(unicode)) {
2056 PyErr_BadArgument();
2057 return NULL;
2058 }
2059 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2060 PyUnicode_GET_SIZE(unicode));
2061}
2062
2063/* --- Raw Unicode Escape Codec ------------------------------------------- */
2064
2065PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2066 int size,
2067 const char *errors)
2068{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069 const char *starts = s;
2070 int startinpos;
2071 int endinpos;
2072 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002074 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 const char *end;
2076 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002077 PyObject *errorHandler = NULL;
2078 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002079
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 /* Escaped strings will always be longer than the resulting
2081 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 length after conversion to the true value. (But decoding error
2083 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 v = _PyUnicode_New(size);
2085 if (v == NULL)
2086 goto onError;
2087 if (size == 0)
2088 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002089 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090 end = s + size;
2091 while (s < end) {
2092 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002093 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002095 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096
2097 /* Non-escape characters are interpreted as Unicode ordinals */
2098 if (*s != '\\') {
2099 *p++ = (unsigned char)*s++;
2100 continue;
2101 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002102 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103
2104 /* \u-escapes are only interpreted iff the number of leading
2105 backslashes if odd */
2106 bs = s;
2107 for (;s < end;) {
2108 if (*s != '\\')
2109 break;
2110 *p++ = (unsigned char)*s++;
2111 }
2112 if (((s - bs) & 1) == 0 ||
2113 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002114 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 continue;
2116 }
2117 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002118 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002119 s++;
2120
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002121 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002123 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002124 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002126 endinpos = s-starts;
2127 if (unicode_decode_call_errorhandler(
2128 errors, &errorHandler,
2129 "rawunicodeescape", "truncated \\uXXXX",
2130 starts, size, &startinpos, &endinpos, &exc, &s,
2131 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 }
2135 x = (x<<4) & ~0xF;
2136 if (c >= '0' && c <= '9')
2137 x += c - '0';
2138 else if (c >= 'a' && c <= 'f')
2139 x += 10 + c - 'a';
2140 else
2141 x += 10 + c - 'A';
2142 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143#ifndef Py_UNICODE_WIDE
2144 if (x > 0x10000) {
2145 if (unicode_decode_call_errorhandler(
2146 errors, &errorHandler,
2147 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2148 starts, size, &startinpos, &endinpos, &exc, &s,
2149 (PyObject **)&v, &outpos, &p))
2150 goto onError;
2151 }
2152#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002153 *p++ = x;
2154 nextByte:
2155 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002157 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002158 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002159 Py_XDECREF(errorHandler);
2160 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002162
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 onError:
2164 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002165 Py_XDECREF(errorHandler);
2166 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 return NULL;
2168}
2169
2170PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2171 int size)
2172{
2173 PyObject *repr;
2174 char *p;
2175 char *q;
2176
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002177 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002179#ifdef Py_UNICODE_WIDE
2180 repr = PyString_FromStringAndSize(NULL, 10 * size);
2181#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002183#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184 if (repr == NULL)
2185 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002186 if (size == 0)
2187 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188
2189 p = q = PyString_AS_STRING(repr);
2190 while (size-- > 0) {
2191 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002192#ifdef Py_UNICODE_WIDE
2193 /* Map 32-bit characters to '\Uxxxxxxxx' */
2194 if (ch >= 0x10000) {
2195 *p++ = '\\';
2196 *p++ = 'U';
2197 *p++ = hexdigit[(ch >> 28) & 0xf];
2198 *p++ = hexdigit[(ch >> 24) & 0xf];
2199 *p++ = hexdigit[(ch >> 20) & 0xf];
2200 *p++ = hexdigit[(ch >> 16) & 0xf];
2201 *p++ = hexdigit[(ch >> 12) & 0xf];
2202 *p++ = hexdigit[(ch >> 8) & 0xf];
2203 *p++ = hexdigit[(ch >> 4) & 0xf];
2204 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002205 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002206 else
2207#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208 /* Map 16-bit characters to '\uxxxx' */
2209 if (ch >= 256) {
2210 *p++ = '\\';
2211 *p++ = 'u';
2212 *p++ = hexdigit[(ch >> 12) & 0xf];
2213 *p++ = hexdigit[(ch >> 8) & 0xf];
2214 *p++ = hexdigit[(ch >> 4) & 0xf];
2215 *p++ = hexdigit[ch & 15];
2216 }
2217 /* Copy everything else as-is */
2218 else
2219 *p++ = (char) ch;
2220 }
2221 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002222 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 return repr;
2224}
2225
2226PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2227{
2228 if (!PyUnicode_Check(unicode)) {
2229 PyErr_BadArgument();
2230 return NULL;
2231 }
2232 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2233 PyUnicode_GET_SIZE(unicode));
2234}
2235
2236/* --- Latin-1 Codec ------------------------------------------------------ */
2237
2238PyObject *PyUnicode_DecodeLatin1(const char *s,
2239 int size,
2240 const char *errors)
2241{
2242 PyUnicodeObject *v;
2243 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002244
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002246 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002247 Py_UNICODE r = *(unsigned char*)s;
2248 return PyUnicode_FromUnicode(&r, 1);
2249 }
2250
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 v = _PyUnicode_New(size);
2252 if (v == NULL)
2253 goto onError;
2254 if (size == 0)
2255 return (PyObject *)v;
2256 p = PyUnicode_AS_UNICODE(v);
2257 while (size-- > 0)
2258 *p++ = (unsigned char)*s++;
2259 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002260
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 onError:
2262 Py_XDECREF(v);
2263 return NULL;
2264}
2265
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266/* create or adjust a UnicodeEncodeError */
2267static void make_encode_exception(PyObject **exceptionObject,
2268 const char *encoding,
2269 const Py_UNICODE *unicode, int size,
2270 int startpos, int endpos,
2271 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002273 if (*exceptionObject == NULL) {
2274 *exceptionObject = PyUnicodeEncodeError_Create(
2275 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 }
2277 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2279 goto onError;
2280 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2281 goto onError;
2282 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2283 goto onError;
2284 return;
2285 onError:
2286 Py_DECREF(*exceptionObject);
2287 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 }
2289}
2290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002291/* raises a UnicodeEncodeError */
2292static void raise_encode_exception(PyObject **exceptionObject,
2293 const char *encoding,
2294 const Py_UNICODE *unicode, int size,
2295 int startpos, int endpos,
2296 const char *reason)
2297{
2298 make_encode_exception(exceptionObject,
2299 encoding, unicode, size, startpos, endpos, reason);
2300 if (*exceptionObject != NULL)
2301 PyCodec_StrictErrors(*exceptionObject);
2302}
2303
2304/* error handling callback helper:
2305 build arguments, call the callback and check the arguments,
2306 put the result into newpos and return the replacement string, which
2307 has to be freed by the caller */
2308static PyObject *unicode_encode_call_errorhandler(const char *errors,
2309 PyObject **errorHandler,
2310 const char *encoding, const char *reason,
2311 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2312 int startpos, int endpos,
2313 int *newpos)
2314{
2315 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2316
2317 PyObject *restuple;
2318 PyObject *resunicode;
2319
2320 if (*errorHandler == NULL) {
2321 *errorHandler = PyCodec_LookupError(errors);
2322 if (*errorHandler == NULL)
2323 return NULL;
2324 }
2325
2326 make_encode_exception(exceptionObject,
2327 encoding, unicode, size, startpos, endpos, reason);
2328 if (*exceptionObject == NULL)
2329 return NULL;
2330
2331 restuple = PyObject_CallFunctionObjArgs(
2332 *errorHandler, *exceptionObject, NULL);
2333 if (restuple == NULL)
2334 return NULL;
2335 if (!PyTuple_Check(restuple)) {
2336 PyErr_Format(PyExc_TypeError, &argparse[4]);
2337 Py_DECREF(restuple);
2338 return NULL;
2339 }
2340 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2341 &resunicode, newpos)) {
2342 Py_DECREF(restuple);
2343 return NULL;
2344 }
2345 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002346 *newpos = size+*newpos;
2347 if (*newpos<0 || *newpos>size) {
2348 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2349 Py_DECREF(restuple);
2350 return NULL;
2351 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002352 Py_INCREF(resunicode);
2353 Py_DECREF(restuple);
2354 return resunicode;
2355}
2356
2357static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2358 int size,
2359 const char *errors,
2360 int limit)
2361{
2362 /* output object */
2363 PyObject *res;
2364 /* pointers to the beginning and end+1 of input */
2365 const Py_UNICODE *startp = p;
2366 const Py_UNICODE *endp = p + size;
2367 /* pointer to the beginning of the unencodable characters */
2368 /* const Py_UNICODE *badp = NULL; */
2369 /* pointer into the output */
2370 char *str;
2371 /* current output position */
2372 int respos = 0;
2373 int ressize;
2374 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2375 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2376 PyObject *errorHandler = NULL;
2377 PyObject *exc = NULL;
2378 /* the following variable is used for caching string comparisons
2379 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2380 int known_errorHandler = -1;
2381
2382 /* allocate enough for a simple encoding without
2383 replacements, if we need more, we'll resize */
2384 res = PyString_FromStringAndSize(NULL, size);
2385 if (res == NULL)
2386 goto onError;
2387 if (size == 0)
2388 return res;
2389 str = PyString_AS_STRING(res);
2390 ressize = size;
2391
2392 while (p<endp) {
2393 Py_UNICODE c = *p;
2394
2395 /* can we encode this? */
2396 if (c<limit) {
2397 /* no overflow check, because we know that the space is enough */
2398 *str++ = (char)c;
2399 ++p;
2400 }
2401 else {
2402 int unicodepos = p-startp;
2403 int requiredsize;
2404 PyObject *repunicode;
2405 int repsize;
2406 int newpos;
2407 int respos;
2408 Py_UNICODE *uni2;
2409 /* startpos for collecting unencodable chars */
2410 const Py_UNICODE *collstart = p;
2411 const Py_UNICODE *collend = p;
2412 /* find all unecodable characters */
2413 while ((collend < endp) && ((*collend)>=limit))
2414 ++collend;
2415 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2416 if (known_errorHandler==-1) {
2417 if ((errors==NULL) || (!strcmp(errors, "strict")))
2418 known_errorHandler = 1;
2419 else if (!strcmp(errors, "replace"))
2420 known_errorHandler = 2;
2421 else if (!strcmp(errors, "ignore"))
2422 known_errorHandler = 3;
2423 else if (!strcmp(errors, "xmlcharrefreplace"))
2424 known_errorHandler = 4;
2425 else
2426 known_errorHandler = 0;
2427 }
2428 switch (known_errorHandler) {
2429 case 1: /* strict */
2430 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2431 goto onError;
2432 case 2: /* replace */
2433 while (collstart++<collend)
2434 *str++ = '?'; /* fall through */
2435 case 3: /* ignore */
2436 p = collend;
2437 break;
2438 case 4: /* xmlcharrefreplace */
2439 respos = str-PyString_AS_STRING(res);
2440 /* determine replacement size (temporarily (mis)uses p) */
2441 for (p = collstart, repsize = 0; p < collend; ++p) {
2442 if (*p<10)
2443 repsize += 2+1+1;
2444 else if (*p<100)
2445 repsize += 2+2+1;
2446 else if (*p<1000)
2447 repsize += 2+3+1;
2448 else if (*p<10000)
2449 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002450#ifndef Py_UNICODE_WIDE
2451 else
2452 repsize += 2+5+1;
2453#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002454 else if (*p<100000)
2455 repsize += 2+5+1;
2456 else if (*p<1000000)
2457 repsize += 2+6+1;
2458 else
2459 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002460#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002461 }
2462 requiredsize = respos+repsize+(endp-collend);
2463 if (requiredsize > ressize) {
2464 if (requiredsize<2*ressize)
2465 requiredsize = 2*ressize;
2466 if (_PyString_Resize(&res, requiredsize))
2467 goto onError;
2468 str = PyString_AS_STRING(res) + respos;
2469 ressize = requiredsize;
2470 }
2471 /* generate replacement (temporarily (mis)uses p) */
2472 for (p = collstart; p < collend; ++p) {
2473 str += sprintf(str, "&#%d;", (int)*p);
2474 }
2475 p = collend;
2476 break;
2477 default:
2478 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2479 encoding, reason, startp, size, &exc,
2480 collstart-startp, collend-startp, &newpos);
2481 if (repunicode == NULL)
2482 goto onError;
2483 /* need more space? (at least enough for what we
2484 have+the replacement+the rest of the string, so
2485 we won't have to check space for encodable characters) */
2486 respos = str-PyString_AS_STRING(res);
2487 repsize = PyUnicode_GET_SIZE(repunicode);
2488 requiredsize = respos+repsize+(endp-collend);
2489 if (requiredsize > ressize) {
2490 if (requiredsize<2*ressize)
2491 requiredsize = 2*ressize;
2492 if (_PyString_Resize(&res, requiredsize)) {
2493 Py_DECREF(repunicode);
2494 goto onError;
2495 }
2496 str = PyString_AS_STRING(res) + respos;
2497 ressize = requiredsize;
2498 }
2499 /* check if there is anything unencodable in the replacement
2500 and copy it to the output */
2501 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2502 c = *uni2;
2503 if (c >= limit) {
2504 raise_encode_exception(&exc, encoding, startp, size,
2505 unicodepos, unicodepos+1, reason);
2506 Py_DECREF(repunicode);
2507 goto onError;
2508 }
2509 *str = (char)c;
2510 }
2511 p = startp + newpos;
2512 Py_DECREF(repunicode);
2513 }
2514 }
2515 }
2516 /* Resize if we allocated to much */
2517 respos = str-PyString_AS_STRING(res);
2518 if (respos<ressize)
2519 /* If this falls res will be NULL */
2520 _PyString_Resize(&res, respos);
2521 Py_XDECREF(errorHandler);
2522 Py_XDECREF(exc);
2523 return res;
2524
2525 onError:
2526 Py_XDECREF(res);
2527 Py_XDECREF(errorHandler);
2528 Py_XDECREF(exc);
2529 return NULL;
2530}
2531
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2533 int size,
2534 const char *errors)
2535{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537}
2538
2539PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2540{
2541 if (!PyUnicode_Check(unicode)) {
2542 PyErr_BadArgument();
2543 return NULL;
2544 }
2545 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2546 PyUnicode_GET_SIZE(unicode),
2547 NULL);
2548}
2549
2550/* --- 7-bit ASCII Codec -------------------------------------------------- */
2551
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552PyObject *PyUnicode_DecodeASCII(const char *s,
2553 int size,
2554 const char *errors)
2555{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002556 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 PyUnicodeObject *v;
2558 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002559 int startinpos;
2560 int endinpos;
2561 int outpos;
2562 const char *e;
2563 PyObject *errorHandler = NULL;
2564 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002565
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002567 if (size == 1 && *(unsigned char*)s < 128) {
2568 Py_UNICODE r = *(unsigned char*)s;
2569 return PyUnicode_FromUnicode(&r, 1);
2570 }
Tim Petersced69f82003-09-16 20:30:58 +00002571
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 v = _PyUnicode_New(size);
2573 if (v == NULL)
2574 goto onError;
2575 if (size == 0)
2576 return (PyObject *)v;
2577 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002578 e = s + size;
2579 while (s < e) {
2580 register unsigned char c = (unsigned char)*s;
2581 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 ++s;
2584 }
2585 else {
2586 startinpos = s-starts;
2587 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002588 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002589 if (unicode_decode_call_errorhandler(
2590 errors, &errorHandler,
2591 "ascii", "ordinal not in range(128)",
2592 starts, size, &startinpos, &endinpos, &exc, &s,
2593 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002597 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002598 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002599 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002600 Py_XDECREF(errorHandler);
2601 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002603
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 onError:
2605 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002606 Py_XDECREF(errorHandler);
2607 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 return NULL;
2609}
2610
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2612 int size,
2613 const char *errors)
2614{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002615 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616}
2617
2618PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2619{
2620 if (!PyUnicode_Check(unicode)) {
2621 PyErr_BadArgument();
2622 return NULL;
2623 }
2624 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2625 PyUnicode_GET_SIZE(unicode),
2626 NULL);
2627}
2628
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002629#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002630
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002631/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002632
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002633PyObject *PyUnicode_DecodeMBCS(const char *s,
2634 int size,
2635 const char *errors)
2636{
2637 PyUnicodeObject *v;
2638 Py_UNICODE *p;
2639
2640 /* First get the size of the result */
2641 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002642 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002643 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2644
2645 v = _PyUnicode_New(usize);
2646 if (v == NULL)
2647 return NULL;
2648 if (usize == 0)
2649 return (PyObject *)v;
2650 p = PyUnicode_AS_UNICODE(v);
2651 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2652 Py_DECREF(v);
2653 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2654 }
2655
2656 return (PyObject *)v;
2657}
2658
2659PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2660 int size,
2661 const char *errors)
2662{
2663 PyObject *repr;
2664 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002665 DWORD mbcssize;
2666
2667 /* If there are no characters, bail now! */
2668 if (size==0)
2669 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002670
2671 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002672 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002673 if (mbcssize==0)
2674 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2675
2676 repr = PyString_FromStringAndSize(NULL, mbcssize);
2677 if (repr == NULL)
2678 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002679 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002680 return repr;
2681
2682 /* Do the conversion */
2683 s = PyString_AS_STRING(repr);
2684 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2685 Py_DECREF(repr);
2686 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2687 }
2688 return repr;
2689}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002690
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002691PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2692{
2693 if (!PyUnicode_Check(unicode)) {
2694 PyErr_BadArgument();
2695 return NULL;
2696 }
2697 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2698 PyUnicode_GET_SIZE(unicode),
2699 NULL);
2700}
2701
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002702#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002703
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704/* --- Character Mapping Codec -------------------------------------------- */
2705
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706PyObject *PyUnicode_DecodeCharmap(const char *s,
2707 int size,
2708 PyObject *mapping,
2709 const char *errors)
2710{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 const char *starts = s;
2712 int startinpos;
2713 int endinpos;
2714 int outpos;
2715 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 PyUnicodeObject *v;
2717 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002718 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002719 PyObject *errorHandler = NULL;
2720 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002721
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 /* Default to Latin-1 */
2723 if (mapping == NULL)
2724 return PyUnicode_DecodeLatin1(s, size, errors);
2725
2726 v = _PyUnicode_New(size);
2727 if (v == NULL)
2728 goto onError;
2729 if (size == 0)
2730 return (PyObject *)v;
2731 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 e = s + size;
2733 while (s < e) {
2734 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 PyObject *w, *x;
2736
2737 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2738 w = PyInt_FromLong((long)ch);
2739 if (w == NULL)
2740 goto onError;
2741 x = PyObject_GetItem(mapping, w);
2742 Py_DECREF(w);
2743 if (x == NULL) {
2744 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002745 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002747 x = Py_None;
2748 Py_INCREF(x);
2749 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002750 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 }
2752
2753 /* Apply mapping */
2754 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002755 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 if (value < 0 || value > 65535) {
2757 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002758 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 Py_DECREF(x);
2760 goto onError;
2761 }
2762 *p++ = (Py_UNICODE)value;
2763 }
2764 else if (x == Py_None) {
2765 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 outpos = p-PyUnicode_AS_UNICODE(v);
2767 startinpos = s-starts;
2768 endinpos = startinpos+1;
2769 if (unicode_decode_call_errorhandler(
2770 errors, &errorHandler,
2771 "charmap", "character maps to <undefined>",
2772 starts, size, &startinpos, &endinpos, &exc, &s,
2773 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 Py_DECREF(x);
2775 goto onError;
2776 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002777 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 }
2779 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002780 int targetsize = PyUnicode_GET_SIZE(x);
2781
2782 if (targetsize == 1)
2783 /* 1-1 mapping */
2784 *p++ = *PyUnicode_AS_UNICODE(x);
2785
2786 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002788 if (targetsize > extrachars) {
2789 /* resize first */
2790 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2791 int needed = (targetsize - extrachars) + \
2792 (targetsize << 2);
2793 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002794 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002795 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002796 Py_DECREF(x);
2797 goto onError;
2798 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002799 p = PyUnicode_AS_UNICODE(v) + oldpos;
2800 }
2801 Py_UNICODE_COPY(p,
2802 PyUnicode_AS_UNICODE(x),
2803 targetsize);
2804 p += targetsize;
2805 extrachars -= targetsize;
2806 }
2807 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 }
2809 else {
2810 /* wrong return value */
2811 PyErr_SetString(PyExc_TypeError,
2812 "character mapping must return integer, None or unicode");
2813 Py_DECREF(x);
2814 goto onError;
2815 }
2816 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 }
2819 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002820 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 Py_XDECREF(errorHandler);
2823 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002825
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 Py_XDECREF(errorHandler);
2828 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829 Py_XDECREF(v);
2830 return NULL;
2831}
2832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833/* Lookup the character ch in the mapping. If the character
2834 can't be found, Py_None is returned (or NULL, if another
2835 error occured). */
2836static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 PyObject *w = PyInt_FromLong((long)c);
2839 PyObject *x;
2840
2841 if (w == NULL)
2842 return NULL;
2843 x = PyObject_GetItem(mapping, w);
2844 Py_DECREF(w);
2845 if (x == NULL) {
2846 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2847 /* No mapping found means: mapping is undefined. */
2848 PyErr_Clear();
2849 x = Py_None;
2850 Py_INCREF(x);
2851 return x;
2852 } else
2853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002855 else if (x == Py_None)
2856 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 else if (PyInt_Check(x)) {
2858 long value = PyInt_AS_LONG(x);
2859 if (value < 0 || value > 255) {
2860 PyErr_SetString(PyExc_TypeError,
2861 "character mapping must be in range(256)");
2862 Py_DECREF(x);
2863 return NULL;
2864 }
2865 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 else if (PyString_Check(x))
2868 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002870 /* wrong return value */
2871 PyErr_SetString(PyExc_TypeError,
2872 "character mapping must return integer, None or str");
2873 Py_DECREF(x);
2874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 }
2876}
2877
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002878/* lookup the character, put the result in the output string and adjust
2879 various state variables. Reallocate the output string if not enough
2880 space is available. Return a new reference to the object that
2881 was put in the output buffer, or Py_None, if the mapping was undefined
2882 (in which case no character was written) or NULL, if a
2883 reallocation error ocurred. The called must decref the result */
2884static
2885PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2886 PyObject **outobj, int *outpos)
2887{
2888 PyObject *rep = charmapencode_lookup(c, mapping);
2889
2890 if (rep==NULL)
2891 return NULL;
2892 else if (rep==Py_None)
2893 return rep;
2894 else {
2895 char *outstart = PyString_AS_STRING(*outobj);
2896 int outsize = PyString_GET_SIZE(*outobj);
2897 if (PyInt_Check(rep)) {
2898 int requiredsize = *outpos+1;
2899 if (outsize<requiredsize) {
2900 /* exponentially overallocate to minimize reallocations */
2901 if (requiredsize < 2*outsize)
2902 requiredsize = 2*outsize;
2903 if (_PyString_Resize(outobj, requiredsize)) {
2904 Py_DECREF(rep);
2905 return NULL;
2906 }
2907 outstart = PyString_AS_STRING(*outobj);
2908 }
2909 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2910 }
2911 else {
2912 const char *repchars = PyString_AS_STRING(rep);
2913 int repsize = PyString_GET_SIZE(rep);
2914 int requiredsize = *outpos+repsize;
2915 if (outsize<requiredsize) {
2916 /* exponentially overallocate to minimize reallocations */
2917 if (requiredsize < 2*outsize)
2918 requiredsize = 2*outsize;
2919 if (_PyString_Resize(outobj, requiredsize)) {
2920 Py_DECREF(rep);
2921 return NULL;
2922 }
2923 outstart = PyString_AS_STRING(*outobj);
2924 }
2925 memcpy(outstart + *outpos, repchars, repsize);
2926 *outpos += repsize;
2927 }
2928 }
2929 return rep;
2930}
2931
2932/* handle an error in PyUnicode_EncodeCharmap
2933 Return 0 on success, -1 on error */
2934static
2935int charmap_encoding_error(
2936 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2937 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002938 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002939 PyObject **res, int *respos)
2940{
2941 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2942 int repsize;
2943 int newpos;
2944 Py_UNICODE *uni2;
2945 /* startpos for collecting unencodable chars */
2946 int collstartpos = *inpos;
2947 int collendpos = *inpos+1;
2948 int collpos;
2949 char *encoding = "charmap";
2950 char *reason = "character maps to <undefined>";
2951
2952 PyObject *x;
2953 /* find all unencodable characters */
2954 while (collendpos < size) {
2955 x = charmapencode_lookup(p[collendpos], mapping);
2956 if (x==NULL)
2957 return -1;
2958 else if (x!=Py_None) {
2959 Py_DECREF(x);
2960 break;
2961 }
2962 Py_DECREF(x);
2963 ++collendpos;
2964 }
2965 /* cache callback name lookup
2966 * (if not done yet, i.e. it's the first error) */
2967 if (*known_errorHandler==-1) {
2968 if ((errors==NULL) || (!strcmp(errors, "strict")))
2969 *known_errorHandler = 1;
2970 else if (!strcmp(errors, "replace"))
2971 *known_errorHandler = 2;
2972 else if (!strcmp(errors, "ignore"))
2973 *known_errorHandler = 3;
2974 else if (!strcmp(errors, "xmlcharrefreplace"))
2975 *known_errorHandler = 4;
2976 else
2977 *known_errorHandler = 0;
2978 }
2979 switch (*known_errorHandler) {
2980 case 1: /* strict */
2981 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2982 return -1;
2983 case 2: /* replace */
2984 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2985 x = charmapencode_output('?', mapping, res, respos);
2986 if (x==NULL) {
2987 return -1;
2988 }
2989 else if (x==Py_None) {
2990 Py_DECREF(x);
2991 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2992 return -1;
2993 }
2994 Py_DECREF(x);
2995 }
2996 /* fall through */
2997 case 3: /* ignore */
2998 *inpos = collendpos;
2999 break;
3000 case 4: /* xmlcharrefreplace */
3001 /* generate replacement (temporarily (mis)uses p) */
3002 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3003 char buffer[2+29+1+1];
3004 char *cp;
3005 sprintf(buffer, "&#%d;", (int)p[collpos]);
3006 for (cp = buffer; *cp; ++cp) {
3007 x = charmapencode_output(*cp, mapping, res, respos);
3008 if (x==NULL)
3009 return -1;
3010 else if (x==Py_None) {
3011 Py_DECREF(x);
3012 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3013 return -1;
3014 }
3015 Py_DECREF(x);
3016 }
3017 }
3018 *inpos = collendpos;
3019 break;
3020 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003021 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022 encoding, reason, p, size, exceptionObject,
3023 collstartpos, collendpos, &newpos);
3024 if (repunicode == NULL)
3025 return -1;
3026 /* generate replacement */
3027 repsize = PyUnicode_GET_SIZE(repunicode);
3028 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3029 x = charmapencode_output(*uni2, mapping, res, respos);
3030 if (x==NULL) {
3031 Py_DECREF(repunicode);
3032 return -1;
3033 }
3034 else if (x==Py_None) {
3035 Py_DECREF(repunicode);
3036 Py_DECREF(x);
3037 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3038 return -1;
3039 }
3040 Py_DECREF(x);
3041 }
3042 *inpos = newpos;
3043 Py_DECREF(repunicode);
3044 }
3045 return 0;
3046}
3047
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3049 int size,
3050 PyObject *mapping,
3051 const char *errors)
3052{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 /* output object */
3054 PyObject *res = NULL;
3055 /* current input position */
3056 int inpos = 0;
3057 /* current output position */
3058 int respos = 0;
3059 PyObject *errorHandler = NULL;
3060 PyObject *exc = NULL;
3061 /* the following variable is used for caching string comparisons
3062 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3063 * 3=ignore, 4=xmlcharrefreplace */
3064 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065
3066 /* Default to Latin-1 */
3067 if (mapping == NULL)
3068 return PyUnicode_EncodeLatin1(p, size, errors);
3069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 /* allocate enough for a simple encoding without
3071 replacements, if we need more, we'll resize */
3072 res = PyString_FromStringAndSize(NULL, size);
3073 if (res == NULL)
3074 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003075 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 while (inpos<size) {
3079 /* try to encode it */
3080 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3081 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003083 if (x==Py_None) { /* unencodable character */
3084 if (charmap_encoding_error(p, size, &inpos, mapping,
3085 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003086 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003087 &res, &respos)) {
3088 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003089 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003092 else
3093 /* done with this character => adjust input position */
3094 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095 Py_DECREF(x);
3096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003098 /* Resize if we allocated to much */
3099 if (respos<PyString_GET_SIZE(res)) {
3100 if (_PyString_Resize(&res, respos))
3101 goto onError;
3102 }
3103 Py_XDECREF(exc);
3104 Py_XDECREF(errorHandler);
3105 return res;
3106
3107 onError:
3108 Py_XDECREF(res);
3109 Py_XDECREF(exc);
3110 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 return NULL;
3112}
3113
3114PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3115 PyObject *mapping)
3116{
3117 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3118 PyErr_BadArgument();
3119 return NULL;
3120 }
3121 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3122 PyUnicode_GET_SIZE(unicode),
3123 mapping,
3124 NULL);
3125}
3126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003127/* create or adjust a UnicodeTranslateError */
3128static void make_translate_exception(PyObject **exceptionObject,
3129 const Py_UNICODE *unicode, int size,
3130 int startpos, int endpos,
3131 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003133 if (*exceptionObject == NULL) {
3134 *exceptionObject = PyUnicodeTranslateError_Create(
3135 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 }
3137 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3139 goto onError;
3140 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3141 goto onError;
3142 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3143 goto onError;
3144 return;
3145 onError:
3146 Py_DECREF(*exceptionObject);
3147 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148 }
3149}
3150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151/* raises a UnicodeTranslateError */
3152static void raise_translate_exception(PyObject **exceptionObject,
3153 const Py_UNICODE *unicode, int size,
3154 int startpos, int endpos,
3155 const char *reason)
3156{
3157 make_translate_exception(exceptionObject,
3158 unicode, size, startpos, endpos, reason);
3159 if (*exceptionObject != NULL)
3160 PyCodec_StrictErrors(*exceptionObject);
3161}
3162
3163/* error handling callback helper:
3164 build arguments, call the callback and check the arguments,
3165 put the result into newpos and return the replacement string, which
3166 has to be freed by the caller */
3167static PyObject *unicode_translate_call_errorhandler(const char *errors,
3168 PyObject **errorHandler,
3169 const char *reason,
3170 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3171 int startpos, int endpos,
3172 int *newpos)
3173{
3174 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3175
3176 PyObject *restuple;
3177 PyObject *resunicode;
3178
3179 if (*errorHandler == NULL) {
3180 *errorHandler = PyCodec_LookupError(errors);
3181 if (*errorHandler == NULL)
3182 return NULL;
3183 }
3184
3185 make_translate_exception(exceptionObject,
3186 unicode, size, startpos, endpos, reason);
3187 if (*exceptionObject == NULL)
3188 return NULL;
3189
3190 restuple = PyObject_CallFunctionObjArgs(
3191 *errorHandler, *exceptionObject, NULL);
3192 if (restuple == NULL)
3193 return NULL;
3194 if (!PyTuple_Check(restuple)) {
3195 PyErr_Format(PyExc_TypeError, &argparse[4]);
3196 Py_DECREF(restuple);
3197 return NULL;
3198 }
3199 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3200 &resunicode, newpos)) {
3201 Py_DECREF(restuple);
3202 return NULL;
3203 }
3204 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003205 *newpos = size+*newpos;
3206 if (*newpos<0 || *newpos>size) {
3207 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3208 Py_DECREF(restuple);
3209 return NULL;
3210 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 Py_INCREF(resunicode);
3212 Py_DECREF(restuple);
3213 return resunicode;
3214}
3215
3216/* Lookup the character ch in the mapping and put the result in result,
3217 which must be decrefed by the caller.
3218 Return 0 on success, -1 on error */
3219static
3220int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3221{
3222 PyObject *w = PyInt_FromLong((long)c);
3223 PyObject *x;
3224
3225 if (w == NULL)
3226 return -1;
3227 x = PyObject_GetItem(mapping, w);
3228 Py_DECREF(w);
3229 if (x == NULL) {
3230 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3231 /* No mapping found means: use 1:1 mapping. */
3232 PyErr_Clear();
3233 *result = NULL;
3234 return 0;
3235 } else
3236 return -1;
3237 }
3238 else if (x == Py_None) {
3239 *result = x;
3240 return 0;
3241 }
3242 else if (PyInt_Check(x)) {
3243 long value = PyInt_AS_LONG(x);
3244 long max = PyUnicode_GetMax();
3245 if (value < 0 || value > max) {
3246 PyErr_Format(PyExc_TypeError,
3247 "character mapping must be in range(0x%lx)", max+1);
3248 Py_DECREF(x);
3249 return -1;
3250 }
3251 *result = x;
3252 return 0;
3253 }
3254 else if (PyUnicode_Check(x)) {
3255 *result = x;
3256 return 0;
3257 }
3258 else {
3259 /* wrong return value */
3260 PyErr_SetString(PyExc_TypeError,
3261 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003262 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 return -1;
3264 }
3265}
3266/* ensure that *outobj is at least requiredsize characters long,
3267if not reallocate and adjust various state variables.
3268Return 0 on success, -1 on error */
3269static
Walter Dörwald4894c302003-10-24 14:25:28 +00003270int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 int requiredsize)
3272{
Walter Dörwald4894c302003-10-24 14:25:28 +00003273 int oldsize = PyUnicode_GET_SIZE(*outobj);
3274 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275 /* remember old output position */
3276 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3277 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003278 if (requiredsize < 2 * oldsize)
3279 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003280 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 return -1;
3282 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 }
3284 return 0;
3285}
3286/* lookup the character, put the result in the output string and adjust
3287 various state variables. Return a new reference to the object that
3288 was put in the output buffer in *result, or Py_None, if the mapping was
3289 undefined (in which case no character was written).
3290 The called must decref result.
3291 Return 0 on success, -1 on error. */
3292static
Walter Dörwald4894c302003-10-24 14:25:28 +00003293int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3294 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3295 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296{
Walter Dörwald4894c302003-10-24 14:25:28 +00003297 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 return -1;
3299 if (*res==NULL) {
3300 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003301 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 }
3303 else if (*res==Py_None)
3304 ;
3305 else if (PyInt_Check(*res)) {
3306 /* no overflow check, because we know that the space is enough */
3307 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3308 }
3309 else if (PyUnicode_Check(*res)) {
3310 int repsize = PyUnicode_GET_SIZE(*res);
3311 if (repsize==1) {
3312 /* no overflow check, because we know that the space is enough */
3313 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3314 }
3315 else if (repsize!=0) {
3316 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003317 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003318 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003319 repsize - 1;
3320 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321 return -1;
3322 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3323 *outp += repsize;
3324 }
3325 }
3326 else
3327 return -1;
3328 return 0;
3329}
3330
/* Translate the `size` characters starting at `p` through `mapping` and
   return the result as a new unicode object, or NULL on error.

   Each character is looked up with charmaptranslate_lookup(): a missing
   entry copies the character unchanged, an integer or unicode value
   replaces it, and None marks it as untranslatable.  A run of consecutive
   untranslatable characters is handled according to `errors`:
     NULL or "strict"     raise via raise_translate_exception()
     "replace"            write one '?' per character
     "ignore"             write nothing
     "xmlcharrefreplace"  write "&#<ordinal>;" per character
     anything else        call the handler of that name through
                          unicode_translate_call_errorhandler()

   `mapping` must not be NULL (PyErr_BadArgument() otherwise). */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     int size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    int respos = 0;
    char *reason = "character maps to <undefined>";
    /* error handler object and exception, created lazily on the first
       error and released on exit */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    /* empty input: nothing to translate (exc and errorHandler are still
       NULL here, so there is nothing to release) */
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to translate it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is only compared by identity below, so the reference can be
           dropped right away */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            int repsize;
            int newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters: extend collend while
               the following characters also map to None */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
                case 1: /* strict */
                    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                    goto onError;
                case 2: /* replace */
                    /* No need to check for space, this is a 1:1 replacement */
                    for (coll = collstart; coll<collend; ++coll)
                        *str++ = '?';
                    /* fall through */
                case 3: /* ignore */
                    p = collend;
                    break;
                case 4: /* xmlcharrefreplace */
                    /* generate replacement (temporarily (mis)uses p) */
                    for (p = collstart; p < collend; ++p) {
                        /* room for "&#", the digits of an int, ";" and
                           the terminating NUL */
                        char buffer[2+29+1+1];
                        char *cp;
                        sprintf(buffer, "&#%d;", (int)*p);
                        /* reserve: already written + this reference +
                           the input that follows the run */
                        if (charmaptranslate_makespace(&res, &str,
                            (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                            goto onError;
                        for (cp = buffer; *cp; ++cp)
                            *str++ = *cp;
                    }
                    p = collend;
                    break;
                default:
                    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                        reason, startp, size, &exc,
                        collstart-startp, collend-startp, &newpos);
                    if (repunicode == NULL)
                        goto onError;
                    /* generate replacement */
                    repsize = PyUnicode_GET_SIZE(repunicode);
                    /* NOTE(review): the space reserved here covers only the
                       input after collend; if a handler returns
                       newpos < collend, the later unchecked 1:1 writes
                       appear to rely on room that was not reserved -
                       confirm. */
                    if (charmaptranslate_makespace(&res, &str,
                        (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                        *str++ = *uni2;
                    /* resume where the handler told us to */
                    p = startp + newpos;
                    Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<PyUnicode_GET_SIZE(res)) {
        if (_PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

    onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
3472
3473PyObject *PyUnicode_Translate(PyObject *str,
3474 PyObject *mapping,
3475 const char *errors)
3476{
3477 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003478
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479 str = PyUnicode_FromObject(str);
3480 if (str == NULL)
3481 goto onError;
3482 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3483 PyUnicode_GET_SIZE(str),
3484 mapping,
3485 errors);
3486 Py_DECREF(str);
3487 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003488
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 onError:
3490 Py_XDECREF(str);
3491 return NULL;
3492}
Tim Petersced69f82003-09-16 20:30:58 +00003493
Guido van Rossum9e896b32000-04-05 20:11:21 +00003494/* --- Decimal Encoder ---------------------------------------------------- */
3495
3496int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3497 int length,
3498 char *output,
3499 const char *errors)
3500{
3501 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 PyObject *errorHandler = NULL;
3503 PyObject *exc = NULL;
3504 const char *encoding = "decimal";
3505 const char *reason = "invalid decimal Unicode string";
3506 /* the following variable is used for caching string comparisons
3507 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3508 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003509
3510 if (output == NULL) {
3511 PyErr_BadArgument();
3512 return -1;
3513 }
3514
3515 p = s;
3516 end = s + length;
3517 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003519 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520 PyObject *repunicode;
3521 int repsize;
3522 int newpos;
3523 Py_UNICODE *uni2;
3524 Py_UNICODE *collstart;
3525 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003526
Guido van Rossum9e896b32000-04-05 20:11:21 +00003527 if (Py_UNICODE_ISSPACE(ch)) {
3528 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003530 continue;
3531 }
3532 decimal = Py_UNICODE_TODECIMAL(ch);
3533 if (decimal >= 0) {
3534 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003536 continue;
3537 }
Guido van Rossumba477042000-04-06 18:18:10 +00003538 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003539 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003541 continue;
3542 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 /* All other characters are considered unencodable */
3544 collstart = p;
3545 collend = p+1;
3546 while (collend < end) {
3547 if ((0 < *collend && *collend < 256) ||
3548 !Py_UNICODE_ISSPACE(*collend) ||
3549 Py_UNICODE_TODECIMAL(*collend))
3550 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003551 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 /* cache callback name lookup
3553 * (if not done yet, i.e. it's the first error) */
3554 if (known_errorHandler==-1) {
3555 if ((errors==NULL) || (!strcmp(errors, "strict")))
3556 known_errorHandler = 1;
3557 else if (!strcmp(errors, "replace"))
3558 known_errorHandler = 2;
3559 else if (!strcmp(errors, "ignore"))
3560 known_errorHandler = 3;
3561 else if (!strcmp(errors, "xmlcharrefreplace"))
3562 known_errorHandler = 4;
3563 else
3564 known_errorHandler = 0;
3565 }
3566 switch (known_errorHandler) {
3567 case 1: /* strict */
3568 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3569 goto onError;
3570 case 2: /* replace */
3571 for (p = collstart; p < collend; ++p)
3572 *output++ = '?';
3573 /* fall through */
3574 case 3: /* ignore */
3575 p = collend;
3576 break;
3577 case 4: /* xmlcharrefreplace */
3578 /* generate replacement (temporarily (mis)uses p) */
3579 for (p = collstart; p < collend; ++p)
3580 output += sprintf(output, "&#%d;", (int)*p);
3581 p = collend;
3582 break;
3583 default:
3584 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3585 encoding, reason, s, length, &exc,
3586 collstart-s, collend-s, &newpos);
3587 if (repunicode == NULL)
3588 goto onError;
3589 /* generate replacement */
3590 repsize = PyUnicode_GET_SIZE(repunicode);
3591 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3592 Py_UNICODE ch = *uni2;
3593 if (Py_UNICODE_ISSPACE(ch))
3594 *output++ = ' ';
3595 else {
3596 decimal = Py_UNICODE_TODECIMAL(ch);
3597 if (decimal >= 0)
3598 *output++ = '0' + decimal;
3599 else if (0 < ch && ch < 256)
3600 *output++ = (char)ch;
3601 else {
3602 Py_DECREF(repunicode);
3603 raise_encode_exception(&exc, encoding,
3604 s, length, collstart-s, collend-s, reason);
3605 goto onError;
3606 }
3607 }
3608 }
3609 p = s + newpos;
3610 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003611 }
3612 }
3613 /* 0-terminate the output string */
3614 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 Py_XDECREF(exc);
3616 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003617 return 0;
3618
3619 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 Py_XDECREF(exc);
3621 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003622 return -1;
3623}
3624
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625/* --- Helpers ------------------------------------------------------------ */
3626
Tim Petersced69f82003-09-16 20:30:58 +00003627static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628int count(PyUnicodeObject *self,
3629 int start,
3630 int end,
3631 PyUnicodeObject *substring)
3632{
3633 int count = 0;
3634
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003635 if (start < 0)
3636 start += self->length;
3637 if (start < 0)
3638 start = 0;
3639 if (end > self->length)
3640 end = self->length;
3641 if (end < 0)
3642 end += self->length;
3643 if (end < 0)
3644 end = 0;
3645
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003646 if (substring->length == 0)
3647 return (end - start + 1);
3648
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 end -= substring->length;
3650
3651 while (start <= end)
3652 if (Py_UNICODE_MATCH(self, start, substring)) {
3653 count++;
3654 start += substring->length;
3655 } else
3656 start++;
3657
3658 return count;
3659}
3660
3661int PyUnicode_Count(PyObject *str,
3662 PyObject *substr,
3663 int start,
3664 int end)
3665{
3666 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003667
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 str = PyUnicode_FromObject(str);
3669 if (str == NULL)
3670 return -1;
3671 substr = PyUnicode_FromObject(substr);
3672 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003673 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 return -1;
3675 }
Tim Petersced69f82003-09-16 20:30:58 +00003676
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 result = count((PyUnicodeObject *)str,
3678 start, end,
3679 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003680
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 Py_DECREF(str);
3682 Py_DECREF(substr);
3683 return result;
3684}
3685
Tim Petersced69f82003-09-16 20:30:58 +00003686static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687int findstring(PyUnicodeObject *self,
3688 PyUnicodeObject *substring,
3689 int start,
3690 int end,
3691 int direction)
3692{
3693 if (start < 0)
3694 start += self->length;
3695 if (start < 0)
3696 start = 0;
3697
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 if (end > self->length)
3699 end = self->length;
3700 if (end < 0)
3701 end += self->length;
3702 if (end < 0)
3703 end = 0;
3704
Guido van Rossum76afbd92002-08-20 17:29:29 +00003705 if (substring->length == 0)
3706 return (direction > 0) ? start : end;
3707
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 end -= substring->length;
3709
3710 if (direction < 0) {
3711 for (; end >= start; end--)
3712 if (Py_UNICODE_MATCH(self, end, substring))
3713 return end;
3714 } else {
3715 for (; start <= end; start++)
3716 if (Py_UNICODE_MATCH(self, start, substring))
3717 return start;
3718 }
3719
3720 return -1;
3721}
3722
3723int PyUnicode_Find(PyObject *str,
3724 PyObject *substr,
3725 int start,
3726 int end,
3727 int direction)
3728{
3729 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003730
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 str = PyUnicode_FromObject(str);
3732 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003733 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734 substr = PyUnicode_FromObject(substr);
3735 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003736 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003737 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 }
Tim Petersced69f82003-09-16 20:30:58 +00003739
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 result = findstring((PyUnicodeObject *)str,
3741 (PyUnicodeObject *)substr,
3742 start, end, direction);
3743 Py_DECREF(str);
3744 Py_DECREF(substr);
3745 return result;
3746}
3747
Tim Petersced69f82003-09-16 20:30:58 +00003748static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749int tailmatch(PyUnicodeObject *self,
3750 PyUnicodeObject *substring,
3751 int start,
3752 int end,
3753 int direction)
3754{
3755 if (start < 0)
3756 start += self->length;
3757 if (start < 0)
3758 start = 0;
3759
3760 if (substring->length == 0)
3761 return 1;
3762
3763 if (end > self->length)
3764 end = self->length;
3765 if (end < 0)
3766 end += self->length;
3767 if (end < 0)
3768 end = 0;
3769
3770 end -= substring->length;
3771 if (end < start)
3772 return 0;
3773
3774 if (direction > 0) {
3775 if (Py_UNICODE_MATCH(self, end, substring))
3776 return 1;
3777 } else {
3778 if (Py_UNICODE_MATCH(self, start, substring))
3779 return 1;
3780 }
3781
3782 return 0;
3783}
3784
3785int PyUnicode_Tailmatch(PyObject *str,
3786 PyObject *substr,
3787 int start,
3788 int end,
3789 int direction)
3790{
3791 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003792
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 str = PyUnicode_FromObject(str);
3794 if (str == NULL)
3795 return -1;
3796 substr = PyUnicode_FromObject(substr);
3797 if (substr == NULL) {
3798 Py_DECREF(substr);
3799 return -1;
3800 }
Tim Petersced69f82003-09-16 20:30:58 +00003801
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 result = tailmatch((PyUnicodeObject *)str,
3803 (PyUnicodeObject *)substr,
3804 start, end, direction);
3805 Py_DECREF(str);
3806 Py_DECREF(substr);
3807 return result;
3808}
3809
Tim Petersced69f82003-09-16 20:30:58 +00003810static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811const Py_UNICODE *findchar(const Py_UNICODE *s,
3812 int size,
3813 Py_UNICODE ch)
3814{
3815 /* like wcschr, but doesn't stop at NULL characters */
3816
3817 while (size-- > 0) {
3818 if (*s == ch)
3819 return s;
3820 s++;
3821 }
3822
3823 return NULL;
3824}
3825
3826/* Apply fixfct filter to the Unicode object self and return a
3827 reference to the modified object */
3828
Tim Petersced69f82003-09-16 20:30:58 +00003829static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830PyObject *fixup(PyUnicodeObject *self,
3831 int (*fixfct)(PyUnicodeObject *s))
3832{
3833
3834 PyUnicodeObject *u;
3835
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003836 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 if (u == NULL)
3838 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003839
3840 Py_UNICODE_COPY(u->str, self->str, self->length);
3841
Tim Peters7a29bd52001-09-12 03:03:31 +00003842 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 /* fixfct should return TRUE if it modified the buffer. If
3844 FALSE, return a reference to the original buffer instead
3845 (to save space, not time) */
3846 Py_INCREF(self);
3847 Py_DECREF(u);
3848 return (PyObject*) self;
3849 }
3850 return (PyObject*) u;
3851}
3852
Tim Petersced69f82003-09-16 20:30:58 +00003853static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854int fixupper(PyUnicodeObject *self)
3855{
3856 int len = self->length;
3857 Py_UNICODE *s = self->str;
3858 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003859
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860 while (len-- > 0) {
3861 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003862
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 ch = Py_UNICODE_TOUPPER(*s);
3864 if (ch != *s) {
3865 status = 1;
3866 *s = ch;
3867 }
3868 s++;
3869 }
3870
3871 return status;
3872}
3873
Tim Petersced69f82003-09-16 20:30:58 +00003874static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875int fixlower(PyUnicodeObject *self)
3876{
3877 int len = self->length;
3878 Py_UNICODE *s = self->str;
3879 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003880
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881 while (len-- > 0) {
3882 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003883
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 ch = Py_UNICODE_TOLOWER(*s);
3885 if (ch != *s) {
3886 status = 1;
3887 *s = ch;
3888 }
3889 s++;
3890 }
3891
3892 return status;
3893}
3894
Tim Petersced69f82003-09-16 20:30:58 +00003895static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896int fixswapcase(PyUnicodeObject *self)
3897{
3898 int len = self->length;
3899 Py_UNICODE *s = self->str;
3900 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003901
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 while (len-- > 0) {
3903 if (Py_UNICODE_ISUPPER(*s)) {
3904 *s = Py_UNICODE_TOLOWER(*s);
3905 status = 1;
3906 } else if (Py_UNICODE_ISLOWER(*s)) {
3907 *s = Py_UNICODE_TOUPPER(*s);
3908 status = 1;
3909 }
3910 s++;
3911 }
3912
3913 return status;
3914}
3915
Tim Petersced69f82003-09-16 20:30:58 +00003916static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917int fixcapitalize(PyUnicodeObject *self)
3918{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003919 int len = self->length;
3920 Py_UNICODE *s = self->str;
3921 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003922
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003923 if (len == 0)
3924 return 0;
3925 if (Py_UNICODE_ISLOWER(*s)) {
3926 *s = Py_UNICODE_TOUPPER(*s);
3927 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003929 s++;
3930 while (--len > 0) {
3931 if (Py_UNICODE_ISUPPER(*s)) {
3932 *s = Py_UNICODE_TOLOWER(*s);
3933 status = 1;
3934 }
3935 s++;
3936 }
3937 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938}
3939
3940static
3941int fixtitle(PyUnicodeObject *self)
3942{
3943 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3944 register Py_UNICODE *e;
3945 int previous_is_cased;
3946
3947 /* Shortcut for single character strings */
3948 if (PyUnicode_GET_SIZE(self) == 1) {
3949 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3950 if (*p != ch) {
3951 *p = ch;
3952 return 1;
3953 }
3954 else
3955 return 0;
3956 }
Tim Petersced69f82003-09-16 20:30:58 +00003957
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 e = p + PyUnicode_GET_SIZE(self);
3959 previous_is_cased = 0;
3960 for (; p < e; p++) {
3961 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003962
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 if (previous_is_cased)
3964 *p = Py_UNICODE_TOLOWER(ch);
3965 else
3966 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003967
3968 if (Py_UNICODE_ISLOWER(ch) ||
3969 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 Py_UNICODE_ISTITLE(ch))
3971 previous_is_cased = 1;
3972 else
3973 previous_is_cased = 0;
3974 }
3975 return 1;
3976}
3977
Tim Peters8ce9f162004-08-27 01:49:32 +00003978PyObject *
3979PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980{
Tim Peters8ce9f162004-08-27 01:49:32 +00003981 PyObject *internal_separator = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 Py_UNICODE *sep;
Tim Peters8ce9f162004-08-27 01:49:32 +00003983 size_t seplen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 PyUnicodeObject *res = NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00003985 size_t sz; /* # allocated bytes for string in res */
3986 size_t reslen; /* # used bytes */
3987 Py_UNICODE *p; /* pointer to free byte in res's string area */
3988 PyObject *it; /* iterator */
3989 PyObject *item;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990 int i;
Tim Peters8ce9f162004-08-27 01:49:32 +00003991 PyObject *temp;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992
Tim Peters2cfe3682001-05-05 05:36:48 +00003993 it = PyObject_GetIter(seq);
3994 if (it == NULL)
3995 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996
Tim Peters8ce9f162004-08-27 01:49:32 +00003997 item = PyIter_Next(it);
3998 if (item == NULL) {
3999 if (PyErr_Occurred())
4000 goto onError;
4001 /* empty sequence; return u"" */
4002 res = _PyUnicode_New(0);
4003 goto Done;
4004 }
4005
4006 /* If this is the only item, maybe we can get out cheap. */
4007 res = (PyUnicodeObject *)item;
4008 item = PyIter_Next(it);
4009 if (item == NULL) {
4010 if (PyErr_Occurred())
4011 goto onError;
4012 /* There's only one item in the sequence. */
4013 if (PyUnicode_CheckExact(res)) /* whatever.join([u]) -> u */
4014 goto Done;
4015 }
4016
4017 /* There are at least two to join (item != NULL), or there's only
4018 * one but it's not an exact Unicode (item == NULL). res needs
4019 * conversion to Unicode in either case.
4020 * Caution: we may need to ensure a copy is made, and that's trickier
4021 * than it sounds because, e.g., PyUnicode_FromObject() may return
4022 * a shared object (which must not be mutated).
4023 */
4024 if (! PyUnicode_Check(res) && ! PyString_Check(res)) {
4025 PyErr_Format(PyExc_TypeError,
4026 "sequence item 0: expected string or Unicode,"
4027 " %.80s found",
4028 res->ob_type->tp_name);
4029 Py_XDECREF(item);
4030 goto onError;
4031 }
4032 temp = PyUnicode_FromObject((PyObject *)res);
4033 if (temp == NULL) {
4034 Py_XDECREF(item);
4035 goto onError;
4036 }
4037 Py_DECREF(res);
4038 if (item == NULL) {
4039 /* res was the only item */
4040 res = (PyUnicodeObject *)temp;
4041 goto Done;
4042 }
4043 /* There are at least two items. As above, temp may be a shared object,
4044 * so we need to copy it.
4045 */
4046 reslen = PyUnicode_GET_SIZE(temp);
4047 sz = reslen + 100; /* breathing room */
4048 if (sz < reslen || sz > INT_MAX) /* overflow -- no breathing room */
4049 sz = reslen;
Tim Peters894c5122004-08-27 05:08:36 +00004050 res = _PyUnicode_New((int)sz);
Tim Peters8ce9f162004-08-27 01:49:32 +00004051 if (res == NULL) {
4052 Py_DECREF(item);
4053 goto onError;
4054 }
4055 p = PyUnicode_AS_UNICODE(res);
4056 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(temp), (int)reslen);
4057 p += reslen;
4058 Py_DECREF(temp);
4059
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 if (separator == NULL) {
4061 Py_UNICODE blank = ' ';
4062 sep = &blank;
4063 seplen = 1;
4064 }
4065 else {
Tim Peters8ce9f162004-08-27 01:49:32 +00004066 internal_separator = PyUnicode_FromObject(separator);
4067 if (internal_separator == NULL) {
4068 Py_DECREF(item);
Tim Peters2cfe3682001-05-05 05:36:48 +00004069 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004070 }
4071 sep = PyUnicode_AS_UNICODE(internal_separator);
4072 seplen = PyUnicode_GET_SIZE(internal_separator);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 }
Tim Petersced69f82003-09-16 20:30:58 +00004074
Tim Peters8ce9f162004-08-27 01:49:32 +00004075 i = 1;
4076 do {
4077 size_t itemlen;
4078 size_t newreslen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079
Tim Peters8ce9f162004-08-27 01:49:32 +00004080 /* Catenate the separator, then item. */
4081 /* First convert item to Unicode. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 if (!PyUnicode_Check(item)) {
4083 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004084 if (!PyString_Check(item)) {
4085 PyErr_Format(PyExc_TypeError,
4086 "sequence item %i: expected string or Unicode,"
4087 " %.80s found",
4088 i, item->ob_type->tp_name);
4089 Py_DECREF(item);
4090 goto onError;
4091 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092 v = PyUnicode_FromObject(item);
4093 Py_DECREF(item);
4094 item = v;
4095 if (item == NULL)
4096 goto onError;
4097 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004098 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters8ce9f162004-08-27 01:49:32 +00004100 newreslen = reslen + seplen + itemlen;
4101 if (newreslen < reslen || newreslen > INT_MAX)
4102 goto Overflow;
4103 if (newreslen > sz) {
4104 do {
4105 size_t oldsize = sz;
4106 sz += sz;
4107 if (sz < oldsize || sz > INT_MAX)
4108 goto Overflow;
4109 } while (newreslen > sz);
4110 if (_PyUnicode_Resize(&res, (int)sz) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004111 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004113 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004114 p = PyUnicode_AS_UNICODE(res) + reslen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004116 Py_UNICODE_COPY(p, sep, (int)seplen);
4117 p += seplen;
4118 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), (int)itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 p += itemlen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 Py_DECREF(item);
Tim Peters8ce9f162004-08-27 01:49:32 +00004121 reslen = newreslen;
4122
4123 ++i;
4124 item = PyIter_Next(it);
4125 } while (item != NULL);
4126 if (PyErr_Occurred())
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 goto onError;
4128
Tim Peters8ce9f162004-08-27 01:49:32 +00004129 if (_PyUnicode_Resize(&res, (int)reslen) < 0)
4130 goto onError;
4131
4132 Done:
4133 Py_XDECREF(internal_separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004134 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 return (PyObject *)res;
4136
Tim Peters8ce9f162004-08-27 01:49:32 +00004137 Overflow:
4138 PyErr_SetString(PyExc_OverflowError,
4139 "join() is too long for a Python string");
4140 Py_DECREF(item);
4141 /* fall through */
4142
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004144 Py_XDECREF(internal_separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004145 Py_DECREF(it);
Tim Peters8ce9f162004-08-27 01:49:32 +00004146 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 return NULL;
4148}
4149
Tim Petersced69f82003-09-16 20:30:58 +00004150static
4151PyUnicodeObject *pad(PyUnicodeObject *self,
4152 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153 int right,
4154 Py_UNICODE fill)
4155{
4156 PyUnicodeObject *u;
4157
4158 if (left < 0)
4159 left = 0;
4160 if (right < 0)
4161 right = 0;
4162
Tim Peters7a29bd52001-09-12 03:03:31 +00004163 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164 Py_INCREF(self);
4165 return self;
4166 }
4167
4168 u = _PyUnicode_New(left + self->length + right);
4169 if (u) {
4170 if (left)
4171 Py_UNICODE_FILL(u->str, fill, left);
4172 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4173 if (right)
4174 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4175 }
4176
4177 return u;
4178}
4179
/* Append the slice data[left:right] to `list` as a new Unicode object.
   The expansion relies on the enclosing function providing a
   `PyObject *str` scratch variable, a `list` variable and an `onError`
   label, which is jumped to on failure.  Wrapped in do { } while (0)
   so that the expansion behaves as exactly one statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4190
/* Insert the slice data[left:right] at the front of `list` as a new
   Unicode object.  The expansion relies on the enclosing function
   providing a `PyObject *str` scratch variable, a `list` variable and
   an `onError` label, which is jumped to on failure.  Wrapped in
   do { } while (0) so that the expansion behaves as exactly one
   statement. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4201
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202static
4203PyObject *split_whitespace(PyUnicodeObject *self,
4204 PyObject *list,
4205 int maxcount)
4206{
4207 register int i;
4208 register int j;
4209 int len = self->length;
4210 PyObject *str;
4211
4212 for (i = j = 0; i < len; ) {
4213 /* find a token */
4214 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4215 i++;
4216 j = i;
4217 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4218 i++;
4219 if (j < i) {
4220 if (maxcount-- <= 0)
4221 break;
4222 SPLIT_APPEND(self->str, j, i);
4223 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4224 i++;
4225 j = i;
4226 }
4227 }
4228 if (j < len) {
4229 SPLIT_APPEND(self->str, j, len);
4230 }
4231 return list;
4232
4233 onError:
4234 Py_DECREF(list);
4235 return NULL;
4236}
4237
4238PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004239 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004240{
4241 register int i;
4242 register int j;
4243 int len;
4244 PyObject *list;
4245 PyObject *str;
4246 Py_UNICODE *data;
4247
4248 string = PyUnicode_FromObject(string);
4249 if (string == NULL)
4250 return NULL;
4251 data = PyUnicode_AS_UNICODE(string);
4252 len = PyUnicode_GET_SIZE(string);
4253
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 list = PyList_New(0);
4255 if (!list)
4256 goto onError;
4257
4258 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004259 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004260
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261 /* Find a line and append it */
4262 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4263 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004264
4265 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004266 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267 if (i < len) {
4268 if (data[i] == '\r' && i + 1 < len &&
4269 data[i+1] == '\n')
4270 i += 2;
4271 else
4272 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004273 if (keepends)
4274 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275 }
Guido van Rossum86662912000-04-11 15:38:46 +00004276 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277 j = i;
4278 }
4279 if (j < len) {
4280 SPLIT_APPEND(data, j, len);
4281 }
4282
4283 Py_DECREF(string);
4284 return list;
4285
4286 onError:
4287 Py_DECREF(list);
4288 Py_DECREF(string);
4289 return NULL;
4290}
4291
Tim Petersced69f82003-09-16 20:30:58 +00004292static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293PyObject *split_char(PyUnicodeObject *self,
4294 PyObject *list,
4295 Py_UNICODE ch,
4296 int maxcount)
4297{
4298 register int i;
4299 register int j;
4300 int len = self->length;
4301 PyObject *str;
4302
4303 for (i = j = 0; i < len; ) {
4304 if (self->str[i] == ch) {
4305 if (maxcount-- <= 0)
4306 break;
4307 SPLIT_APPEND(self->str, j, i);
4308 i = j = i + 1;
4309 } else
4310 i++;
4311 }
4312 if (j <= len) {
4313 SPLIT_APPEND(self->str, j, len);
4314 }
4315 return list;
4316
4317 onError:
4318 Py_DECREF(list);
4319 return NULL;
4320}
4321
Tim Petersced69f82003-09-16 20:30:58 +00004322static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323PyObject *split_substring(PyUnicodeObject *self,
4324 PyObject *list,
4325 PyUnicodeObject *substring,
4326 int maxcount)
4327{
4328 register int i;
4329 register int j;
4330 int len = self->length;
4331 int sublen = substring->length;
4332 PyObject *str;
4333
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004334 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335 if (Py_UNICODE_MATCH(self, i, substring)) {
4336 if (maxcount-- <= 0)
4337 break;
4338 SPLIT_APPEND(self->str, j, i);
4339 i = j = i + sublen;
4340 } else
4341 i++;
4342 }
4343 if (j <= len) {
4344 SPLIT_APPEND(self->str, j, len);
4345 }
4346 return list;
4347
4348 onError:
4349 Py_DECREF(list);
4350 return NULL;
4351}
4352
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004353static
4354PyObject *rsplit_whitespace(PyUnicodeObject *self,
4355 PyObject *list,
4356 int maxcount)
4357{
4358 register int i;
4359 register int j;
4360 int len = self->length;
4361 PyObject *str;
4362
4363 for (i = j = len - 1; i >= 0; ) {
4364 /* find a token */
4365 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4366 i--;
4367 j = i;
4368 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4369 i--;
4370 if (j > i) {
4371 if (maxcount-- <= 0)
4372 break;
4373 SPLIT_INSERT(self->str, i + 1, j + 1);
4374 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4375 i--;
4376 j = i;
4377 }
4378 }
4379 if (j >= 0) {
4380 SPLIT_INSERT(self->str, 0, j + 1);
4381 }
4382 return list;
4383
4384 onError:
4385 Py_DECREF(list);
4386 return NULL;
4387}
4388
4389static
4390PyObject *rsplit_char(PyUnicodeObject *self,
4391 PyObject *list,
4392 Py_UNICODE ch,
4393 int maxcount)
4394{
4395 register int i;
4396 register int j;
4397 int len = self->length;
4398 PyObject *str;
4399
4400 for (i = j = len - 1; i >= 0; ) {
4401 if (self->str[i] == ch) {
4402 if (maxcount-- <= 0)
4403 break;
4404 SPLIT_INSERT(self->str, i + 1, j + 1);
4405 j = i = i - 1;
4406 } else
4407 i--;
4408 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004409 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004410 SPLIT_INSERT(self->str, 0, j + 1);
4411 }
4412 return list;
4413
4414 onError:
4415 Py_DECREF(list);
4416 return NULL;
4417}
4418
4419static
4420PyObject *rsplit_substring(PyUnicodeObject *self,
4421 PyObject *list,
4422 PyUnicodeObject *substring,
4423 int maxcount)
4424{
4425 register int i;
4426 register int j;
4427 int len = self->length;
4428 int sublen = substring->length;
4429 PyObject *str;
4430
4431 for (i = len - sublen, j = len; i >= 0; ) {
4432 if (Py_UNICODE_MATCH(self, i, substring)) {
4433 if (maxcount-- <= 0)
4434 break;
4435 SPLIT_INSERT(self->str, i + sublen, j);
4436 j = i;
4437 i -= sublen;
4438 } else
4439 i--;
4440 }
4441 if (j >= 0) {
4442 SPLIT_INSERT(self->str, 0, j);
4443 }
4444 return list;
4445
4446 onError:
4447 Py_DECREF(list);
4448 return NULL;
4449}
4450
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004452#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453
4454static
4455PyObject *split(PyUnicodeObject *self,
4456 PyUnicodeObject *substring,
4457 int maxcount)
4458{
4459 PyObject *list;
4460
4461 if (maxcount < 0)
4462 maxcount = INT_MAX;
4463
4464 list = PyList_New(0);
4465 if (!list)
4466 return NULL;
4467
4468 if (substring == NULL)
4469 return split_whitespace(self,list,maxcount);
4470
4471 else if (substring->length == 1)
4472 return split_char(self,list,substring->str[0],maxcount);
4473
4474 else if (substring->length == 0) {
4475 Py_DECREF(list);
4476 PyErr_SetString(PyExc_ValueError, "empty separator");
4477 return NULL;
4478 }
4479 else
4480 return split_substring(self,list,substring,maxcount);
4481}
4482
Tim Petersced69f82003-09-16 20:30:58 +00004483static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004484PyObject *rsplit(PyUnicodeObject *self,
4485 PyUnicodeObject *substring,
4486 int maxcount)
4487{
4488 PyObject *list;
4489
4490 if (maxcount < 0)
4491 maxcount = INT_MAX;
4492
4493 list = PyList_New(0);
4494 if (!list)
4495 return NULL;
4496
4497 if (substring == NULL)
4498 return rsplit_whitespace(self,list,maxcount);
4499
4500 else if (substring->length == 1)
4501 return rsplit_char(self,list,substring->str[0],maxcount);
4502
4503 else if (substring->length == 0) {
4504 Py_DECREF(list);
4505 PyErr_SetString(PyExc_ValueError, "empty separator");
4506 return NULL;
4507 }
4508 else
4509 return rsplit_substring(self,list,substring,maxcount);
4510}
4511
4512static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513PyObject *replace(PyUnicodeObject *self,
4514 PyUnicodeObject *str1,
4515 PyUnicodeObject *str2,
4516 int maxcount)
4517{
4518 PyUnicodeObject *u;
4519
4520 if (maxcount < 0)
4521 maxcount = INT_MAX;
4522
4523 if (str1->length == 1 && str2->length == 1) {
4524 int i;
4525
4526 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004527 if (!findchar(self->str, self->length, str1->str[0]) &&
4528 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529 /* nothing to replace, return original string */
4530 Py_INCREF(self);
4531 u = self;
4532 } else {
4533 Py_UNICODE u1 = str1->str[0];
4534 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004535
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004537 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538 self->length
4539 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004540 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004541 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004542 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 for (i = 0; i < u->length; i++)
4544 if (u->str[i] == u1) {
4545 if (--maxcount < 0)
4546 break;
4547 u->str[i] = u2;
4548 }
4549 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551
4552 } else {
4553 int n, i;
4554 Py_UNICODE *p;
4555
4556 /* replace strings */
4557 n = count(self, 0, self->length, str1);
4558 if (n > maxcount)
4559 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004560 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004562 if (PyUnicode_CheckExact(self)) {
4563 Py_INCREF(self);
4564 u = self;
4565 }
4566 else {
4567 u = (PyUnicodeObject *)
4568 PyUnicode_FromUnicode(self->str, self->length);
4569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 } else {
4571 u = _PyUnicode_New(
4572 self->length + n * (str2->length - str1->length));
4573 if (u) {
4574 i = 0;
4575 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004576 if (str1->length > 0) {
4577 while (i <= self->length - str1->length)
4578 if (Py_UNICODE_MATCH(self, i, str1)) {
4579 /* replace string segment */
4580 Py_UNICODE_COPY(p, str2->str, str2->length);
4581 p += str2->length;
4582 i += str1->length;
4583 if (--n <= 0) {
4584 /* copy remaining part */
4585 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4586 break;
4587 }
4588 } else
4589 *p++ = self->str[i++];
4590 } else {
4591 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592 Py_UNICODE_COPY(p, str2->str, str2->length);
4593 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004594 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004597 }
4598 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600 }
4601 }
4602 }
Tim Petersced69f82003-09-16 20:30:58 +00004603
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 return (PyObject *) u;
4605}
4606
4607/* --- Unicode Object Methods --------------------------------------------- */
4608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004609PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004610"S.title() -> unicode\n\
4611\n\
4612Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004613characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614
4615static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004616unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618 return fixup(self, fixtitle);
4619}
4620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004621PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622"S.capitalize() -> unicode\n\
4623\n\
4624Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004625have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626
4627static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004628unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630 return fixup(self, fixcapitalize);
4631}
4632
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0): split on
   whitespace, capitalize every word in place in the list, then join
   the words again. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    int i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *old = PyList_GET_ITEM(words, i);
        PyObject *capitalized = fixup((PyUnicodeObject *)old,
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old);
        PyList_SET_ITEM(words, i, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4670
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004671/* Argument converter. Coerces to a single unicode character */
4672
4673static int
4674convert_uc(PyObject *obj, void *addr)
4675{
4676 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4677 PyObject *uniobj;
4678 Py_UNICODE *unistr;
4679
4680 uniobj = PyUnicode_FromObject(obj);
4681 if (uniobj == NULL) {
4682 PyErr_SetString(PyExc_TypeError,
4683 "The fill character cannot be converted to Unicode");
4684 return 0;
4685 }
4686 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4687 PyErr_SetString(PyExc_TypeError,
4688 "The fill character must be exactly one character long");
4689 Py_DECREF(uniobj);
4690 return 0;
4691 }
4692 unistr = PyUnicode_AS_UNICODE(uniobj);
4693 *fillcharloc = unistr[0];
4694 Py_DECREF(uniobj);
4695 return 1;
4696}
4697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004698PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004699"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004701Return S centered in a Unicode string of length width. Padding is\n\
4702done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703
4704static PyObject *
4705unicode_center(PyUnicodeObject *self, PyObject *args)
4706{
4707 int marg, left;
4708 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004709 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004711 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712 return NULL;
4713
Tim Peters7a29bd52001-09-12 03:03:31 +00004714 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715 Py_INCREF(self);
4716 return (PyObject*) self;
4717 }
4718
4719 marg = width - self->length;
4720 left = marg / 2 + (marg & width & 1);
4721
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004722 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723}
4724
Marc-André Lemburge5034372000-08-08 08:04:29 +00004725#if 0
4726
4727/* This code should go into some future Unicode collation support
4728 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004729 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004730
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004731/* speedy UTF-16 code point order comparison */
4732/* gleaned from: */
4733/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4734
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004735static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004736{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004737 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004738 0, 0, 0, 0, 0, 0, 0, 0,
4739 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004740 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004741};
4742
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743static int
4744unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4745{
4746 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004747
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 Py_UNICODE *s1 = str1->str;
4749 Py_UNICODE *s2 = str2->str;
4750
4751 len1 = str1->length;
4752 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004753
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004755 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004756
4757 c1 = *s1++;
4758 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004759
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004760 if (c1 > (1<<11) * 26)
4761 c1 += utf16Fixup[c1>>11];
4762 if (c2 > (1<<11) * 26)
4763 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004764 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004765
4766 if (c1 != c2)
4767 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004768
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004769 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 }
4771
4772 return (len1 < len2) ? -1 : (len1 != len2);
4773}
4774
Marc-André Lemburge5034372000-08-08 08:04:29 +00004775#else
4776
4777static int
4778unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4779{
4780 register int len1, len2;
4781
4782 Py_UNICODE *s1 = str1->str;
4783 Py_UNICODE *s2 = str2->str;
4784
4785 len1 = str1->length;
4786 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004787
Marc-André Lemburge5034372000-08-08 08:04:29 +00004788 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004789 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004790
Fredrik Lundh45714e92001-06-26 16:39:36 +00004791 c1 = *s1++;
4792 c2 = *s2++;
4793
4794 if (c1 != c2)
4795 return (c1 < c2) ? -1 : 1;
4796
Marc-André Lemburge5034372000-08-08 08:04:29 +00004797 len1--; len2--;
4798 }
4799
4800 return (len1 < len2) ? -1 : (len1 != len2);
4801}
4802
4803#endif
4804
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805int PyUnicode_Compare(PyObject *left,
4806 PyObject *right)
4807{
4808 PyUnicodeObject *u = NULL, *v = NULL;
4809 int result;
4810
4811 /* Coerce the two arguments */
4812 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4813 if (u == NULL)
4814 goto onError;
4815 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4816 if (v == NULL)
4817 goto onError;
4818
Thomas Wouters7e474022000-07-16 12:04:32 +00004819 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 if (v == u) {
4821 Py_DECREF(u);
4822 Py_DECREF(v);
4823 return 0;
4824 }
4825
4826 result = unicode_compare(u, v);
4827
4828 Py_DECREF(u);
4829 Py_DECREF(v);
4830 return result;
4831
4832onError:
4833 Py_XDECREF(u);
4834 Py_XDECREF(v);
4835 return -1;
4836}
4837
Guido van Rossum403d68b2000-03-13 15:55:09 +00004838int PyUnicode_Contains(PyObject *container,
4839 PyObject *element)
4840{
4841 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004842 int result, size;
4843 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004844
4845 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004846 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004847 if (v == NULL) {
4848 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004849 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004850 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004851 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004852 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004853 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004854 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004855
Barry Warsaw817918c2002-08-06 16:58:21 +00004856 size = PyUnicode_GET_SIZE(v);
4857 rhs = PyUnicode_AS_UNICODE(v);
4858 lhs = PyUnicode_AS_UNICODE(u);
4859
Guido van Rossum403d68b2000-03-13 15:55:09 +00004860 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004861 if (size == 1) {
4862 end = lhs + PyUnicode_GET_SIZE(u);
4863 while (lhs < end) {
4864 if (*lhs++ == *rhs) {
4865 result = 1;
4866 break;
4867 }
4868 }
4869 }
4870 else {
4871 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4872 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004873 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004874 result = 1;
4875 break;
4876 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004877 }
4878 }
4879
4880 Py_DECREF(u);
4881 Py_DECREF(v);
4882 return result;
4883
4884onError:
4885 Py_XDECREF(u);
4886 Py_XDECREF(v);
4887 return -1;
4888}
4889
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890/* Concat to string or Unicode object giving a new Unicode object. */
4891
4892PyObject *PyUnicode_Concat(PyObject *left,
4893 PyObject *right)
4894{
4895 PyUnicodeObject *u = NULL, *v = NULL, *w;
4896
4897 /* Coerce the two arguments */
4898 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4899 if (u == NULL)
4900 goto onError;
4901 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4902 if (v == NULL)
4903 goto onError;
4904
4905 /* Shortcuts */
4906 if (v == unicode_empty) {
4907 Py_DECREF(v);
4908 return (PyObject *)u;
4909 }
4910 if (u == unicode_empty) {
4911 Py_DECREF(u);
4912 return (PyObject *)v;
4913 }
4914
4915 /* Concat the two Unicode strings */
4916 w = _PyUnicode_New(u->length + v->length);
4917 if (w == NULL)
4918 goto onError;
4919 Py_UNICODE_COPY(w->str, u->str, u->length);
4920 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4921
4922 Py_DECREF(u);
4923 Py_DECREF(v);
4924 return (PyObject *)w;
4925
4926onError:
4927 Py_XDECREF(u);
4928 Py_XDECREF(v);
4929 return NULL;
4930}
4931
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004932PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933"S.count(sub[, start[, end]]) -> int\n\
4934\n\
4935Return the number of occurrences of substring sub in Unicode string\n\
4936S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004937interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938
4939static PyObject *
4940unicode_count(PyUnicodeObject *self, PyObject *args)
4941{
4942 PyUnicodeObject *substring;
4943 int start = 0;
4944 int end = INT_MAX;
4945 PyObject *result;
4946
Guido van Rossumb8872e62000-05-09 14:14:27 +00004947 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4948 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949 return NULL;
4950
4951 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4952 (PyObject *)substring);
4953 if (substring == NULL)
4954 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004955
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 if (start < 0)
4957 start += self->length;
4958 if (start < 0)
4959 start = 0;
4960 if (end > self->length)
4961 end = self->length;
4962 if (end < 0)
4963 end += self->length;
4964 if (end < 0)
4965 end = 0;
4966
4967 result = PyInt_FromLong((long) count(self, start, end, substring));
4968
4969 Py_DECREF(substring);
4970 return result;
4971}
4972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004973PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004974"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004976Encodes S using the codec registered for encoding. encoding defaults\n\
4977to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004978handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004979a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4980'xmlcharrefreplace' as well as any other name registered with\n\
4981codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982
4983static PyObject *
4984unicode_encode(PyUnicodeObject *self, PyObject *args)
4985{
4986 char *encoding = NULL;
4987 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004988 PyObject *v;
4989
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4991 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004992 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004993 if (v == NULL)
4994 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004995 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
4996 PyErr_Format(PyExc_TypeError,
4997 "encoder did not return a string/unicode object "
4998 "(type=%.400s)",
4999 v->ob_type->tp_name);
5000 Py_DECREF(v);
5001 return NULL;
5002 }
5003 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005004
5005 onError:
5006 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005007}
5008
5009PyDoc_STRVAR(decode__doc__,
5010"S.decode([encoding[,errors]]) -> string or unicode\n\
5011\n\
5012Decodes S using the codec registered for encoding. encoding defaults\n\
5013to the default encoding. errors may be given to set a different error\n\
5014handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5015a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5016as well as any other name registerd with codecs.register_error that is\n\
5017able to handle UnicodeDecodeErrors.");
5018
5019static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005020unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005021{
5022 char *encoding = NULL;
5023 char *errors = NULL;
5024 PyObject *v;
5025
5026 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5027 return NULL;
5028 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005029 if (v == NULL)
5030 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005031 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5032 PyErr_Format(PyExc_TypeError,
5033 "decoder did not return a string/unicode object "
5034 "(type=%.400s)",
5035 v->ob_type->tp_name);
5036 Py_DECREF(v);
5037 return NULL;
5038 }
5039 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005040
5041 onError:
5042 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043}
5044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005045PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046"S.expandtabs([tabsize]) -> unicode\n\
5047\n\
5048Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005049If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050
5051static PyObject*
5052unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5053{
5054 Py_UNICODE *e;
5055 Py_UNICODE *p;
5056 Py_UNICODE *q;
5057 int i, j;
5058 PyUnicodeObject *u;
5059 int tabsize = 8;
5060
5061 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5062 return NULL;
5063
Thomas Wouters7e474022000-07-16 12:04:32 +00005064 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065 i = j = 0;
5066 e = self->str + self->length;
5067 for (p = self->str; p < e; p++)
5068 if (*p == '\t') {
5069 if (tabsize > 0)
5070 j += tabsize - (j % tabsize);
5071 }
5072 else {
5073 j++;
5074 if (*p == '\n' || *p == '\r') {
5075 i += j;
5076 j = 0;
5077 }
5078 }
5079
5080 /* Second pass: create output string and fill it */
5081 u = _PyUnicode_New(i + j);
5082 if (!u)
5083 return NULL;
5084
5085 j = 0;
5086 q = u->str;
5087
5088 for (p = self->str; p < e; p++)
5089 if (*p == '\t') {
5090 if (tabsize > 0) {
5091 i = tabsize - (j % tabsize);
5092 j += i;
5093 while (i--)
5094 *q++ = ' ';
5095 }
5096 }
5097 else {
5098 j++;
5099 *q++ = *p;
5100 if (*p == '\n' || *p == '\r')
5101 j = 0;
5102 }
5103
5104 return (PyObject*) u;
5105}
5106
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005107PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108"S.find(sub [,start [,end]]) -> int\n\
5109\n\
5110Return the lowest index in S where substring sub is found,\n\
5111such that sub is contained within s[start,end]. Optional\n\
5112arguments start and end are interpreted as in slice notation.\n\
5113\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005114Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115
5116static PyObject *
5117unicode_find(PyUnicodeObject *self, PyObject *args)
5118{
5119 PyUnicodeObject *substring;
5120 int start = 0;
5121 int end = INT_MAX;
5122 PyObject *result;
5123
Guido van Rossumb8872e62000-05-09 14:14:27 +00005124 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5125 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126 return NULL;
5127 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5128 (PyObject *)substring);
5129 if (substring == NULL)
5130 return NULL;
5131
5132 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5133
5134 Py_DECREF(substring);
5135 return result;
5136}
5137
5138static PyObject *
5139unicode_getitem(PyUnicodeObject *self, int index)
5140{
5141 if (index < 0 || index >= self->length) {
5142 PyErr_SetString(PyExc_IndexError, "string index out of range");
5143 return NULL;
5144 }
5145
5146 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5147}
5148
5149static long
5150unicode_hash(PyUnicodeObject *self)
5151{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005152 /* Since Unicode objects compare equal to their ASCII string
5153 counterparts, they should use the individual character values
5154 as basis for their hash value. This is needed to assure that
5155 strings and Unicode objects behave in the same way as
5156 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157
Fredrik Lundhdde61642000-07-10 18:27:47 +00005158 register int len;
5159 register Py_UNICODE *p;
5160 register long x;
5161
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162 if (self->hash != -1)
5163 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005164 len = PyUnicode_GET_SIZE(self);
5165 p = PyUnicode_AS_UNICODE(self);
5166 x = *p << 7;
5167 while (--len >= 0)
5168 x = (1000003*x) ^ *p++;
5169 x ^= PyUnicode_GET_SIZE(self);
5170 if (x == -1)
5171 x = -2;
5172 self->hash = x;
5173 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174}
5175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005176PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177"S.index(sub [,start [,end]]) -> int\n\
5178\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005179Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180
5181static PyObject *
5182unicode_index(PyUnicodeObject *self, PyObject *args)
5183{
5184 int result;
5185 PyUnicodeObject *substring;
5186 int start = 0;
5187 int end = INT_MAX;
5188
Guido van Rossumb8872e62000-05-09 14:14:27 +00005189 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5190 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005192
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5194 (PyObject *)substring);
5195 if (substring == NULL)
5196 return NULL;
5197
5198 result = findstring(self, substring, start, end, 1);
5199
5200 Py_DECREF(substring);
5201 if (result < 0) {
5202 PyErr_SetString(PyExc_ValueError, "substring not found");
5203 return NULL;
5204 }
5205 return PyInt_FromLong(result);
5206}
5207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005208PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005209"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005211Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005212at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213
5214static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005215unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216{
5217 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5218 register const Py_UNICODE *e;
5219 int cased;
5220
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221 /* Shortcut for single character strings */
5222 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005223 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005225 /* Special case for empty strings */
5226 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005227 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005228
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 e = p + PyUnicode_GET_SIZE(self);
5230 cased = 0;
5231 for (; p < e; p++) {
5232 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005233
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005235 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 else if (!cased && Py_UNICODE_ISLOWER(ch))
5237 cased = 1;
5238 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005239 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240}
5241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005242PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005243"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005245Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005246at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247
5248static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005249unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250{
5251 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5252 register const Py_UNICODE *e;
5253 int cased;
5254
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 /* Shortcut for single character strings */
5256 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005257 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005259 /* Special case for empty strings */
5260 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005261 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005262
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 e = p + PyUnicode_GET_SIZE(self);
5264 cased = 0;
5265 for (; p < e; p++) {
5266 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005267
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005269 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 else if (!cased && Py_UNICODE_ISUPPER(ch))
5271 cased = 1;
5272 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005273 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274}
5275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005276PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005277"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005279Return True if S is a titlecased string and there is at least one\n\
5280character in S, i.e. upper- and titlecase characters may only\n\
5281follow uncased characters and lowercase characters only cased ones.\n\
5282Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283
5284static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005285unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286{
5287 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5288 register const Py_UNICODE *e;
5289 int cased, previous_is_cased;
5290
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 /* Shortcut for single character strings */
5292 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005293 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5294 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005296 /* Special case for empty strings */
5297 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005298 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005299
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 e = p + PyUnicode_GET_SIZE(self);
5301 cased = 0;
5302 previous_is_cased = 0;
5303 for (; p < e; p++) {
5304 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005305
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5307 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005308 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 previous_is_cased = 1;
5310 cased = 1;
5311 }
5312 else if (Py_UNICODE_ISLOWER(ch)) {
5313 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005314 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 previous_is_cased = 1;
5316 cased = 1;
5317 }
5318 else
5319 previous_is_cased = 0;
5320 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005321 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322}
5323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005324PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005325"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005327Return True if all characters in S are whitespace\n\
5328and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329
5330static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005331unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332{
5333 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5334 register const Py_UNICODE *e;
5335
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 /* Shortcut for single character strings */
5337 if (PyUnicode_GET_SIZE(self) == 1 &&
5338 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005339 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005341 /* Special case for empty strings */
5342 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005343 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005344
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 e = p + PyUnicode_GET_SIZE(self);
5346 for (; p < e; p++) {
5347 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005348 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005350 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351}
5352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005353PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005354"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005355\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005356Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005357and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005358
5359static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005360unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005361{
5362 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5363 register const Py_UNICODE *e;
5364
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005365 /* Shortcut for single character strings */
5366 if (PyUnicode_GET_SIZE(self) == 1 &&
5367 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005368 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005369
5370 /* Special case for empty strings */
5371 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005372 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005373
5374 e = p + PyUnicode_GET_SIZE(self);
5375 for (; p < e; p++) {
5376 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005377 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005378 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005379 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005380}
5381
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005382PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005383"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005384\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005385Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005386and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005387
5388static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005389unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005390{
5391 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5392 register const Py_UNICODE *e;
5393
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005394 /* Shortcut for single character strings */
5395 if (PyUnicode_GET_SIZE(self) == 1 &&
5396 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005397 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005398
5399 /* Special case for empty strings */
5400 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005401 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005402
5403 e = p + PyUnicode_GET_SIZE(self);
5404 for (; p < e; p++) {
5405 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005406 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005407 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005408 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005409}
5410
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005411PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005412"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005414Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005415False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416
5417static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005418unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419{
5420 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5421 register const Py_UNICODE *e;
5422
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 /* Shortcut for single character strings */
5424 if (PyUnicode_GET_SIZE(self) == 1 &&
5425 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005426 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005428 /* Special case for empty strings */
5429 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005430 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005431
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 e = p + PyUnicode_GET_SIZE(self);
5433 for (; p < e; p++) {
5434 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005435 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005437 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438}
5439
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005440PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005441"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005443Return True if all characters in S are digits\n\
5444and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445
5446static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005447unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448{
5449 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5450 register const Py_UNICODE *e;
5451
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 /* Shortcut for single character strings */
5453 if (PyUnicode_GET_SIZE(self) == 1 &&
5454 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005455 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005457 /* Special case for empty strings */
5458 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005459 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005460
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 e = p + PyUnicode_GET_SIZE(self);
5462 for (; p < e; p++) {
5463 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005464 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005466 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467}
5468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005469PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005470"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005472Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005473False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474
5475static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005476unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477{
5478 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5479 register const Py_UNICODE *e;
5480
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 /* Shortcut for single character strings */
5482 if (PyUnicode_GET_SIZE(self) == 1 &&
5483 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005484 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005486 /* Special case for empty strings */
5487 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005488 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005489
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 e = p + PyUnicode_GET_SIZE(self);
5491 for (; p < e; p++) {
5492 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005493 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005495 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496}
5497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005498PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499"S.join(sequence) -> unicode\n\
5500\n\
5501Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005502sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503
5504static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005505unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005507 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508}
5509
5510static int
5511unicode_length(PyUnicodeObject *self)
5512{
5513 return self->length;
5514}
5515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005516PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005517"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518\n\
5519Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005520done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521
5522static PyObject *
5523unicode_ljust(PyUnicodeObject *self, PyObject *args)
5524{
5525 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005526 Py_UNICODE fillchar = ' ';
5527
5528 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 return NULL;
5530
Tim Peters7a29bd52001-09-12 03:03:31 +00005531 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 Py_INCREF(self);
5533 return (PyObject*) self;
5534 }
5535
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005536 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537}
5538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005539PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540"S.lower() -> unicode\n\
5541\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005542Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543
5544static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005545unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 return fixup(self, fixlower);
5548}
5549
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
5558
5559static const Py_UNICODE *
5560unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5561{
Tim Peters030a5ce2002-04-22 19:00:10 +00005562 size_t i;
5563 for (i = 0; i < n; ++i)
5564 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005565 return s+i;
5566 return NULL;
5567}
5568
5569/* externally visible for str.strip(unicode) */
5570PyObject *
5571_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5572{
5573 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5574 int len = PyUnicode_GET_SIZE(self);
5575 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5576 int seplen = PyUnicode_GET_SIZE(sepobj);
5577 int i, j;
5578
5579 i = 0;
5580 if (striptype != RIGHTSTRIP) {
5581 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5582 i++;
5583 }
5584 }
5585
5586 j = len;
5587 if (striptype != LEFTSTRIP) {
5588 do {
5589 j--;
5590 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5591 j++;
5592 }
5593
5594 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5595 Py_INCREF(self);
5596 return (PyObject*)self;
5597 }
5598 else
5599 return PyUnicode_FromUnicode(s+i, j-i);
5600}
5601
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602
5603static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005604do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005606 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5607 int len = PyUnicode_GET_SIZE(self), i, j;
5608
5609 i = 0;
5610 if (striptype != RIGHTSTRIP) {
5611 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5612 i++;
5613 }
5614 }
5615
5616 j = len;
5617 if (striptype != LEFTSTRIP) {
5618 do {
5619 j--;
5620 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5621 j++;
5622 }
5623
5624 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5625 Py_INCREF(self);
5626 return (PyObject*)self;
5627 }
5628 else
5629 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630}
5631
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005632
5633static PyObject *
5634do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5635{
5636 PyObject *sep = NULL;
5637
5638 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5639 return NULL;
5640
5641 if (sep != NULL && sep != Py_None) {
5642 if (PyUnicode_Check(sep))
5643 return _PyUnicode_XStrip(self, striptype, sep);
5644 else if (PyString_Check(sep)) {
5645 PyObject *res;
5646 sep = PyUnicode_FromObject(sep);
5647 if (sep==NULL)
5648 return NULL;
5649 res = _PyUnicode_XStrip(self, striptype, sep);
5650 Py_DECREF(sep);
5651 return res;
5652 }
5653 else {
5654 PyErr_Format(PyExc_TypeError,
5655 "%s arg must be None, unicode or str",
5656 STRIPNAME(striptype));
5657 return NULL;
5658 }
5659 }
5660
5661 return do_strip(self, striptype);
5662}
5663
5664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005665PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005666"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005667\n\
5668Return a copy of the string S with leading and trailing\n\
5669whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005670If chars is given and not None, remove characters in chars instead.\n\
5671If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005672
5673static PyObject *
5674unicode_strip(PyUnicodeObject *self, PyObject *args)
5675{
5676 if (PyTuple_GET_SIZE(args) == 0)
5677 return do_strip(self, BOTHSTRIP); /* Common case */
5678 else
5679 return do_argstrip(self, BOTHSTRIP, args);
5680}
5681
5682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005683PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005684"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005685\n\
5686Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005687If chars is given and not None, remove characters in chars instead.\n\
5688If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005689
5690static PyObject *
5691unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5692{
5693 if (PyTuple_GET_SIZE(args) == 0)
5694 return do_strip(self, LEFTSTRIP); /* Common case */
5695 else
5696 return do_argstrip(self, LEFTSTRIP, args);
5697}
5698
5699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005700PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005701"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005702\n\
5703Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005704If chars is given and not None, remove characters in chars instead.\n\
5705If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005706
5707static PyObject *
5708unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5709{
5710 if (PyTuple_GET_SIZE(args) == 0)
5711 return do_strip(self, RIGHTSTRIP); /* Common case */
5712 else
5713 return do_argstrip(self, RIGHTSTRIP, args);
5714}
5715
5716
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717static PyObject*
5718unicode_repeat(PyUnicodeObject *str, int len)
5719{
5720 PyUnicodeObject *u;
5721 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005722 int nchars;
5723 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724
5725 if (len < 0)
5726 len = 0;
5727
Tim Peters7a29bd52001-09-12 03:03:31 +00005728 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 /* no repeat, return original string */
5730 Py_INCREF(str);
5731 return (PyObject*) str;
5732 }
Tim Peters8f422462000-09-09 06:13:41 +00005733
5734 /* ensure # of chars needed doesn't overflow int and # of bytes
5735 * needed doesn't overflow size_t
5736 */
5737 nchars = len * str->length;
5738 if (len && nchars / len != str->length) {
5739 PyErr_SetString(PyExc_OverflowError,
5740 "repeated string is too long");
5741 return NULL;
5742 }
5743 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5744 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5745 PyErr_SetString(PyExc_OverflowError,
5746 "repeated string is too long");
5747 return NULL;
5748 }
5749 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 if (!u)
5751 return NULL;
5752
5753 p = u->str;
5754
5755 while (len-- > 0) {
5756 Py_UNICODE_COPY(p, str->str, str->length);
5757 p += str->length;
5758 }
5759
5760 return (PyObject*) u;
5761}
5762
5763PyObject *PyUnicode_Replace(PyObject *obj,
5764 PyObject *subobj,
5765 PyObject *replobj,
5766 int maxcount)
5767{
5768 PyObject *self;
5769 PyObject *str1;
5770 PyObject *str2;
5771 PyObject *result;
5772
5773 self = PyUnicode_FromObject(obj);
5774 if (self == NULL)
5775 return NULL;
5776 str1 = PyUnicode_FromObject(subobj);
5777 if (str1 == NULL) {
5778 Py_DECREF(self);
5779 return NULL;
5780 }
5781 str2 = PyUnicode_FromObject(replobj);
5782 if (str2 == NULL) {
5783 Py_DECREF(self);
5784 Py_DECREF(str1);
5785 return NULL;
5786 }
Tim Petersced69f82003-09-16 20:30:58 +00005787 result = replace((PyUnicodeObject *)self,
5788 (PyUnicodeObject *)str1,
5789 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 maxcount);
5791 Py_DECREF(self);
5792 Py_DECREF(str1);
5793 Py_DECREF(str2);
5794 return result;
5795}
5796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005797PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798"S.replace (old, new[, maxsplit]) -> unicode\n\
5799\n\
5800Return a copy of S with all occurrences of substring\n\
5801old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005802given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803
5804static PyObject*
5805unicode_replace(PyUnicodeObject *self, PyObject *args)
5806{
5807 PyUnicodeObject *str1;
5808 PyUnicodeObject *str2;
5809 int maxcount = -1;
5810 PyObject *result;
5811
5812 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5813 return NULL;
5814 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5815 if (str1 == NULL)
5816 return NULL;
5817 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005818 if (str2 == NULL) {
5819 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822
5823 result = replace(self, str1, str2, maxcount);
5824
5825 Py_DECREF(str1);
5826 Py_DECREF(str2);
5827 return result;
5828}
5829
5830static
5831PyObject *unicode_repr(PyObject *unicode)
5832{
5833 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5834 PyUnicode_GET_SIZE(unicode),
5835 1);
5836}
5837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005838PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839"S.rfind(sub [,start [,end]]) -> int\n\
5840\n\
5841Return the highest index in S where substring sub is found,\n\
5842such that sub is contained within s[start,end]. Optional\n\
5843arguments start and end are interpreted as in slice notation.\n\
5844\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005845Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846
5847static PyObject *
5848unicode_rfind(PyUnicodeObject *self, PyObject *args)
5849{
5850 PyUnicodeObject *substring;
5851 int start = 0;
5852 int end = INT_MAX;
5853 PyObject *result;
5854
Guido van Rossumb8872e62000-05-09 14:14:27 +00005855 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5856 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 return NULL;
5858 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5859 (PyObject *)substring);
5860 if (substring == NULL)
5861 return NULL;
5862
5863 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5864
5865 Py_DECREF(substring);
5866 return result;
5867}
5868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005869PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870"S.rindex(sub [,start [,end]]) -> int\n\
5871\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005872Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
5874static PyObject *
5875unicode_rindex(PyUnicodeObject *self, PyObject *args)
5876{
5877 int result;
5878 PyUnicodeObject *substring;
5879 int start = 0;
5880 int end = INT_MAX;
5881
Guido van Rossumb8872e62000-05-09 14:14:27 +00005882 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5883 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 return NULL;
5885 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5886 (PyObject *)substring);
5887 if (substring == NULL)
5888 return NULL;
5889
5890 result = findstring(self, substring, start, end, -1);
5891
5892 Py_DECREF(substring);
5893 if (result < 0) {
5894 PyErr_SetString(PyExc_ValueError, "substring not found");
5895 return NULL;
5896 }
5897 return PyInt_FromLong(result);
5898}
5899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005900PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005901"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902\n\
5903Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005904done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
5906static PyObject *
5907unicode_rjust(PyUnicodeObject *self, PyObject *args)
5908{
5909 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005910 Py_UNICODE fillchar = ' ';
5911
5912 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 return NULL;
5914
Tim Peters7a29bd52001-09-12 03:03:31 +00005915 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 Py_INCREF(self);
5917 return (PyObject*) self;
5918 }
5919
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005920 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921}
5922
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923static PyObject*
5924unicode_slice(PyUnicodeObject *self, int start, int end)
5925{
5926 /* standard clamping */
5927 if (start < 0)
5928 start = 0;
5929 if (end < 0)
5930 end = 0;
5931 if (end > self->length)
5932 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005933 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 /* full slice, return original string */
5935 Py_INCREF(self);
5936 return (PyObject*) self;
5937 }
5938 if (start > end)
5939 start = end;
5940 /* copy slice */
5941 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5942 end - start);
5943}
5944
5945PyObject *PyUnicode_Split(PyObject *s,
5946 PyObject *sep,
5947 int maxsplit)
5948{
5949 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005950
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 s = PyUnicode_FromObject(s);
5952 if (s == NULL)
5953 return NULL;
5954 if (sep != NULL) {
5955 sep = PyUnicode_FromObject(sep);
5956 if (sep == NULL) {
5957 Py_DECREF(s);
5958 return NULL;
5959 }
5960 }
5961
5962 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5963
5964 Py_DECREF(s);
5965 Py_XDECREF(sep);
5966 return result;
5967}
5968
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005969PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970"S.split([sep [,maxsplit]]) -> list of strings\n\
5971\n\
5972Return a list of the words in S, using sep as the\n\
5973delimiter string. If maxsplit is given, at most maxsplit\n\
5974splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005975is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
5977static PyObject*
5978unicode_split(PyUnicodeObject *self, PyObject *args)
5979{
5980 PyObject *substring = Py_None;
5981 int maxcount = -1;
5982
5983 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5984 return NULL;
5985
5986 if (substring == Py_None)
5987 return split(self, NULL, maxcount);
5988 else if (PyUnicode_Check(substring))
5989 return split(self, (PyUnicodeObject *)substring, maxcount);
5990 else
5991 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5992}
5993
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005994PyObject *PyUnicode_RSplit(PyObject *s,
5995 PyObject *sep,
5996 int maxsplit)
5997{
5998 PyObject *result;
5999
6000 s = PyUnicode_FromObject(s);
6001 if (s == NULL)
6002 return NULL;
6003 if (sep != NULL) {
6004 sep = PyUnicode_FromObject(sep);
6005 if (sep == NULL) {
6006 Py_DECREF(s);
6007 return NULL;
6008 }
6009 }
6010
6011 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6012
6013 Py_DECREF(s);
6014 Py_XDECREF(sep);
6015 return result;
6016}
6017
6018PyDoc_STRVAR(rsplit__doc__,
6019"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6020\n\
6021Return a list of the words in S, using sep as the\n\
6022delimiter string, starting at the end of the string and\n\
6023working to the front. If maxsplit is given, at most maxsplit\n\
6024splits are done. If sep is not specified, any whitespace string\n\
6025is a separator.");
6026
6027static PyObject*
6028unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6029{
6030 PyObject *substring = Py_None;
6031 int maxcount = -1;
6032
6033 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6034 return NULL;
6035
6036 if (substring == Py_None)
6037 return rsplit(self, NULL, maxcount);
6038 else if (PyUnicode_Check(substring))
6039 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6040 else
6041 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6042}
6043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006044PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006045"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046\n\
6047Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006048Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006049is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050
6051static PyObject*
6052unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6053{
Guido van Rossum86662912000-04-11 15:38:46 +00006054 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055
Guido van Rossum86662912000-04-11 15:38:46 +00006056 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 return NULL;
6058
Guido van Rossum86662912000-04-11 15:38:46 +00006059 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060}
6061
6062static
6063PyObject *unicode_str(PyUnicodeObject *self)
6064{
Fred Drakee4315f52000-05-09 19:53:39 +00006065 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066}
6067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006068PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069"S.swapcase() -> unicode\n\
6070\n\
6071Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006072and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073
6074static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006075unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 return fixup(self, fixswapcase);
6078}
6079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006080PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081"S.translate(table) -> unicode\n\
6082\n\
6083Return a copy of the string S, where all characters have been mapped\n\
6084through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006085Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6086Unmapped characters are left untouched. Characters mapped to None\n\
6087are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088
6089static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006090unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091{
Tim Petersced69f82003-09-16 20:30:58 +00006092 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006094 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 "ignore");
6096}
6097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006098PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099"S.upper() -> unicode\n\
6100\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006101Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102
6103static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006104unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 return fixup(self, fixupper);
6107}
6108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006109PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110"S.zfill(width) -> unicode\n\
6111\n\
6112Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006113of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114
6115static PyObject *
6116unicode_zfill(PyUnicodeObject *self, PyObject *args)
6117{
6118 int fill;
6119 PyUnicodeObject *u;
6120
6121 int width;
6122 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6123 return NULL;
6124
6125 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006126 if (PyUnicode_CheckExact(self)) {
6127 Py_INCREF(self);
6128 return (PyObject*) self;
6129 }
6130 else
6131 return PyUnicode_FromUnicode(
6132 PyUnicode_AS_UNICODE(self),
6133 PyUnicode_GET_SIZE(self)
6134 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 }
6136
6137 fill = width - self->length;
6138
6139 u = pad(self, fill, 0, '0');
6140
Walter Dörwald068325e2002-04-15 13:36:47 +00006141 if (u == NULL)
6142 return NULL;
6143
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 if (u->str[fill] == '+' || u->str[fill] == '-') {
6145 /* move sign to beginning of string */
6146 u->str[0] = u->str[fill];
6147 u->str[fill] = '0';
6148 }
6149
6150 return (PyObject*) u;
6151}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152
#if 0
/* Compiled out.  Debugging aid: report unicode_freelist_size as a Python
   int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006161PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006162"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006164Return True if S starts with the specified prefix, False otherwise.\n\
6165With optional start, test S beginning at that position.\n\
6166With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167
6168static PyObject *
6169unicode_startswith(PyUnicodeObject *self,
6170 PyObject *args)
6171{
6172 PyUnicodeObject *substring;
6173 int start = 0;
6174 int end = INT_MAX;
6175 PyObject *result;
6176
Guido van Rossumb8872e62000-05-09 14:14:27 +00006177 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6178 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 return NULL;
6180 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6181 (PyObject *)substring);
6182 if (substring == NULL)
6183 return NULL;
6184
Guido van Rossum77f6a652002-04-03 22:41:51 +00006185 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186
6187 Py_DECREF(substring);
6188 return result;
6189}
6190
6191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006192PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006193"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006195Return True if S ends with the specified suffix, False otherwise.\n\
6196With optional start, test S beginning at that position.\n\
6197With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198
6199static PyObject *
6200unicode_endswith(PyUnicodeObject *self,
6201 PyObject *args)
6202{
6203 PyUnicodeObject *substring;
6204 int start = 0;
6205 int end = INT_MAX;
6206 PyObject *result;
6207
Guido van Rossumb8872e62000-05-09 14:14:27 +00006208 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6209 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 return NULL;
6211 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6212 (PyObject *)substring);
6213 if (substring == NULL)
6214 return NULL;
6215
Guido van Rossum77f6a652002-04-03 22:41:51 +00006216 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217
6218 Py_DECREF(substring);
6219 return result;
6220}
6221
6222
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006223
6224static PyObject *
6225unicode_getnewargs(PyUnicodeObject *v)
6226{
6227 return Py_BuildValue("(u#)", v->str, v->length);
6228}
6229
6230
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231static PyMethodDef unicode_methods[] = {
6232
6233 /* Order is according to common usage: often used methods should
6234 appear first, since lookup is done sequentially. */
6235
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006236 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6237 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6238 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006239 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006240 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6241 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6242 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6243 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6244 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6245 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6246 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6247 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6248 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6249 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006250 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006251 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006252/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6253 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6254 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6255 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006256 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006257 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006258 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006259 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6260 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6261 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6262 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6263 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6264 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6265 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6266 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6267 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6268 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6269 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6270 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6271 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6272 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006273 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006274#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006275 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276#endif
6277
6278#if 0
6279 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006280 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281#endif
6282
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006283 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 {NULL, NULL}
6285};
6286
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006287static PyObject *
6288unicode_mod(PyObject *v, PyObject *w)
6289{
6290 if (!PyUnicode_Check(v)) {
6291 Py_INCREF(Py_NotImplemented);
6292 return Py_NotImplemented;
6293 }
6294 return PyUnicode_Format(v, w);
6295}
6296
6297static PyNumberMethods unicode_as_number = {
6298 0, /*nb_add*/
6299 0, /*nb_subtract*/
6300 0, /*nb_multiply*/
6301 0, /*nb_divide*/
6302 unicode_mod, /*nb_remainder*/
6303};
6304
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305static PySequenceMethods unicode_as_sequence = {
6306 (inquiry) unicode_length, /* sq_length */
6307 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6308 (intargfunc) unicode_repeat, /* sq_repeat */
6309 (intargfunc) unicode_getitem, /* sq_item */
6310 (intintargfunc) unicode_slice, /* sq_slice */
6311 0, /* sq_ass_item */
6312 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006313 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314};
6315
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006316static PyObject*
6317unicode_subscript(PyUnicodeObject* self, PyObject* item)
6318{
6319 if (PyInt_Check(item)) {
6320 long i = PyInt_AS_LONG(item);
6321 if (i < 0)
6322 i += PyString_GET_SIZE(self);
6323 return unicode_getitem(self, i);
6324 } else if (PyLong_Check(item)) {
6325 long i = PyLong_AsLong(item);
6326 if (i == -1 && PyErr_Occurred())
6327 return NULL;
6328 if (i < 0)
6329 i += PyString_GET_SIZE(self);
6330 return unicode_getitem(self, i);
6331 } else if (PySlice_Check(item)) {
6332 int start, stop, step, slicelength, cur, i;
6333 Py_UNICODE* source_buf;
6334 Py_UNICODE* result_buf;
6335 PyObject* result;
6336
6337 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6338 &start, &stop, &step, &slicelength) < 0) {
6339 return NULL;
6340 }
6341
6342 if (slicelength <= 0) {
6343 return PyUnicode_FromUnicode(NULL, 0);
6344 } else {
6345 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6346 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6347
6348 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6349 result_buf[i] = source_buf[cur];
6350 }
Tim Petersced69f82003-09-16 20:30:58 +00006351
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006352 result = PyUnicode_FromUnicode(result_buf, slicelength);
6353 PyMem_FREE(result_buf);
6354 return result;
6355 }
6356 } else {
6357 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6358 return NULL;
6359 }
6360}
6361
6362static PyMappingMethods unicode_as_mapping = {
6363 (inquiry)unicode_length, /* mp_length */
6364 (binaryfunc)unicode_subscript, /* mp_subscript */
6365 (objobjargproc)0, /* mp_ass_subscript */
6366};
6367
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368static int
6369unicode_buffer_getreadbuf(PyUnicodeObject *self,
6370 int index,
6371 const void **ptr)
6372{
6373 if (index != 0) {
6374 PyErr_SetString(PyExc_SystemError,
6375 "accessing non-existent unicode segment");
6376 return -1;
6377 }
6378 *ptr = (void *) self->str;
6379 return PyUnicode_GET_DATA_SIZE(self);
6380}
6381
6382static int
6383unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6384 const void **ptr)
6385{
6386 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006387 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 return -1;
6389}
6390
6391static int
6392unicode_buffer_getsegcount(PyUnicodeObject *self,
6393 int *lenp)
6394{
6395 if (lenp)
6396 *lenp = PyUnicode_GET_DATA_SIZE(self);
6397 return 1;
6398}
6399
6400static int
6401unicode_buffer_getcharbuf(PyUnicodeObject *self,
6402 int index,
6403 const void **ptr)
6404{
6405 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006406
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407 if (index != 0) {
6408 PyErr_SetString(PyExc_SystemError,
6409 "accessing non-existent unicode segment");
6410 return -1;
6411 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006412 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 if (str == NULL)
6414 return -1;
6415 *ptr = (void *) PyString_AS_STRING(str);
6416 return PyString_GET_SIZE(str);
6417}
6418
6419/* Helpers for PyUnicode_Format() */
6420
6421static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006422getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423{
6424 int argidx = *p_argidx;
6425 if (argidx < arglen) {
6426 (*p_argidx)++;
6427 if (arglen < 0)
6428 return args;
6429 else
6430 return PyTuple_GetItem(args, argidx);
6431 }
6432 PyErr_SetString(PyExc_TypeError,
6433 "not enough arguments for format string");
6434 return NULL;
6435}
6436
/* Bit flags, one bit each, so they can be OR-ed together in a single
   int.  formatfloat() and formatint() test F_ALT to decide whether to
   emit '#' in the generated format. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
6442
6443static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445{
6446 register int i;
6447 int len;
6448 va_list va;
6449 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451
6452 /* First, format the string as char array, then expand to Py_UNICODE
6453 array. */
6454 charbuffer = (char *)buffer;
6455 len = vsprintf(charbuffer, format, va);
6456 for (i = len - 1; i >= 0; i--)
6457 buffer[i] = (Py_UNICODE) charbuffer[i];
6458
6459 va_end(va);
6460 return len;
6461}
6462
Guido van Rossum078151d2002-08-11 04:24:12 +00006463/* XXX To save some code duplication, formatfloat/long/int could have been
6464 shared with stringobject.c, converting from 8-bit to Unicode after the
6465 formatting is done. */
6466
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467static int
6468formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006469 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 int flags,
6471 int prec,
6472 int type,
6473 PyObject *v)
6474{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006475 /* fmt = '%#.' + `prec` + `type`
6476 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 char fmt[20];
6478 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006479
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 x = PyFloat_AsDouble(v);
6481 if (x == -1.0 && PyErr_Occurred())
6482 return -1;
6483 if (prec < 0)
6484 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6486 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006487 /* Worst case length calc to ensure no buffer overrun:
6488
6489 'g' formats:
6490 fmt = %#.<prec>g
6491 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6492 for any double rep.)
6493 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6494
6495 'f' formats:
6496 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6497 len = 1 + 50 + 1 + prec = 52 + prec
6498
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006499 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006500 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006501
6502 */
6503 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6504 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006505 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006506 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006507 return -1;
6508 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006509 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6510 (flags&F_ALT) ? "#" : "",
6511 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 return usprintf(buf, fmt, x);
6513}
6514
Tim Peters38fd5b62000-09-21 05:43:11 +00006515static PyObject*
6516formatlong(PyObject *val, int flags, int prec, int type)
6517{
6518 char *buf;
6519 int i, len;
6520 PyObject *str; /* temporary string object. */
6521 PyUnicodeObject *result;
6522
6523 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6524 if (!str)
6525 return NULL;
6526 result = _PyUnicode_New(len);
6527 for (i = 0; i < len; i++)
6528 result->str[i] = buf[i];
6529 result->str[len] = 0;
6530 Py_DECREF(str);
6531 return (PyObject*)result;
6532}
6533
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534static int
6535formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006536 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 int flags,
6538 int prec,
6539 int type,
6540 PyObject *v)
6541{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006542 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006543 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6544 * + 1 + 1
6545 * = 24
6546 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006547 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006548 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 long x;
6550
6551 x = PyInt_AsLong(v);
6552 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006553 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006554 if (x < 0 && type == 'u') {
6555 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006556 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006557 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6558 sign = "-";
6559 else
6560 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006562 prec = 1;
6563
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006564 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6565 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006566 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006567 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006568 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006569 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006570 return -1;
6571 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006572
6573 if ((flags & F_ALT) &&
6574 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006575 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006576 * of issues that cause pain:
6577 * - when 0 is being converted, the C standard leaves off
6578 * the '0x' or '0X', which is inconsistent with other
6579 * %#x/%#X conversions and inconsistent with Python's
6580 * hex() function
6581 * - there are platforms that violate the standard and
6582 * convert 0 with the '0x' or '0X'
6583 * (Metrowerks, Compaq Tru64)
6584 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006585 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006586 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006587 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006588 * We can achieve the desired consistency by inserting our
6589 * own '0x' or '0X' prefix, and substituting %x/%X in place
6590 * of %#x/%#X.
6591 *
6592 * Note that this is the same approach as used in
6593 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006594 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006595 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6596 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006597 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006598 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006599 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6600 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006601 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006602 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006603 if (sign[0])
6604 return usprintf(buf, fmt, -x);
6605 else
6606 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607}
6608
6609static int
6610formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006611 size_t buflen,
6612 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006614 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006615 if (PyUnicode_Check(v)) {
6616 if (PyUnicode_GET_SIZE(v) != 1)
6617 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006619 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006621 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006622 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006623 goto onError;
6624 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6625 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626
6627 else {
6628 /* Integer input truncated to a character */
6629 long x;
6630 x = PyInt_AsLong(v);
6631 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006632 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006633#ifdef Py_UNICODE_WIDE
6634 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006635 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006636 "%c arg not in range(0x110000) "
6637 "(wide Python build)");
6638 return -1;
6639 }
6640#else
6641 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006642 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006643 "%c arg not in range(0x10000) "
6644 "(narrow Python build)");
6645 return -1;
6646 }
6647#endif
6648 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 }
6650 buf[1] = '\0';
6651 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006652
6653 onError:
6654 PyErr_SetString(PyExc_TypeError,
6655 "%c requires int or char");
6656 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657}
6658
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE characters, of the scratch
   buffer in which floats, ints and chars are formatted.  XXX This is a
   magic number.  Each formatting routine does bounds checking to ensure
   no overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. after a bare `sizeof`).
*/
#define FORMATBUFLEN ((size_t)120)
6668
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669PyObject *PyUnicode_Format(PyObject *format,
6670 PyObject *args)
6671{
6672 Py_UNICODE *fmt, *res;
6673 int fmtcnt, rescnt, reslen, arglen, argidx;
6674 int args_owned = 0;
6675 PyUnicodeObject *result = NULL;
6676 PyObject *dict = NULL;
6677 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006678
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 if (format == NULL || args == NULL) {
6680 PyErr_BadInternalCall();
6681 return NULL;
6682 }
6683 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006684 if (uformat == NULL)
6685 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 fmt = PyUnicode_AS_UNICODE(uformat);
6687 fmtcnt = PyUnicode_GET_SIZE(uformat);
6688
6689 reslen = rescnt = fmtcnt + 100;
6690 result = _PyUnicode_New(reslen);
6691 if (result == NULL)
6692 goto onError;
6693 res = PyUnicode_AS_UNICODE(result);
6694
6695 if (PyTuple_Check(args)) {
6696 arglen = PyTuple_Size(args);
6697 argidx = 0;
6698 }
6699 else {
6700 arglen = -1;
6701 argidx = -2;
6702 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006703 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6704 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 dict = args;
6706
6707 while (--fmtcnt >= 0) {
6708 if (*fmt != '%') {
6709 if (--rescnt < 0) {
6710 rescnt = fmtcnt + 100;
6711 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006712 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 return NULL;
6714 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6715 --rescnt;
6716 }
6717 *res++ = *fmt++;
6718 }
6719 else {
6720 /* Got a format specifier */
6721 int flags = 0;
6722 int width = -1;
6723 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 Py_UNICODE c = '\0';
6725 Py_UNICODE fill;
6726 PyObject *v = NULL;
6727 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006728 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 Py_UNICODE sign;
6730 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006731 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732
6733 fmt++;
6734 if (*fmt == '(') {
6735 Py_UNICODE *keystart;
6736 int keylen;
6737 PyObject *key;
6738 int pcount = 1;
6739
6740 if (dict == NULL) {
6741 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006742 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 goto onError;
6744 }
6745 ++fmt;
6746 --fmtcnt;
6747 keystart = fmt;
6748 /* Skip over balanced parentheses */
6749 while (pcount > 0 && --fmtcnt >= 0) {
6750 if (*fmt == ')')
6751 --pcount;
6752 else if (*fmt == '(')
6753 ++pcount;
6754 fmt++;
6755 }
6756 keylen = fmt - keystart - 1;
6757 if (fmtcnt < 0 || pcount > 0) {
6758 PyErr_SetString(PyExc_ValueError,
6759 "incomplete format key");
6760 goto onError;
6761 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006762#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006763 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 then looked up since Python uses strings to hold
6765 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006766 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 key = PyUnicode_EncodeUTF8(keystart,
6768 keylen,
6769 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006770#else
6771 key = PyUnicode_FromUnicode(keystart, keylen);
6772#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 if (key == NULL)
6774 goto onError;
6775 if (args_owned) {
6776 Py_DECREF(args);
6777 args_owned = 0;
6778 }
6779 args = PyObject_GetItem(dict, key);
6780 Py_DECREF(key);
6781 if (args == NULL) {
6782 goto onError;
6783 }
6784 args_owned = 1;
6785 arglen = -1;
6786 argidx = -2;
6787 }
6788 while (--fmtcnt >= 0) {
6789 switch (c = *fmt++) {
6790 case '-': flags |= F_LJUST; continue;
6791 case '+': flags |= F_SIGN; continue;
6792 case ' ': flags |= F_BLANK; continue;
6793 case '#': flags |= F_ALT; continue;
6794 case '0': flags |= F_ZERO; continue;
6795 }
6796 break;
6797 }
6798 if (c == '*') {
6799 v = getnextarg(args, arglen, &argidx);
6800 if (v == NULL)
6801 goto onError;
6802 if (!PyInt_Check(v)) {
6803 PyErr_SetString(PyExc_TypeError,
6804 "* wants int");
6805 goto onError;
6806 }
6807 width = PyInt_AsLong(v);
6808 if (width < 0) {
6809 flags |= F_LJUST;
6810 width = -width;
6811 }
6812 if (--fmtcnt >= 0)
6813 c = *fmt++;
6814 }
6815 else if (c >= '0' && c <= '9') {
6816 width = c - '0';
6817 while (--fmtcnt >= 0) {
6818 c = *fmt++;
6819 if (c < '0' || c > '9')
6820 break;
6821 if ((width*10) / 10 != width) {
6822 PyErr_SetString(PyExc_ValueError,
6823 "width too big");
6824 goto onError;
6825 }
6826 width = width*10 + (c - '0');
6827 }
6828 }
6829 if (c == '.') {
6830 prec = 0;
6831 if (--fmtcnt >= 0)
6832 c = *fmt++;
6833 if (c == '*') {
6834 v = getnextarg(args, arglen, &argidx);
6835 if (v == NULL)
6836 goto onError;
6837 if (!PyInt_Check(v)) {
6838 PyErr_SetString(PyExc_TypeError,
6839 "* wants int");
6840 goto onError;
6841 }
6842 prec = PyInt_AsLong(v);
6843 if (prec < 0)
6844 prec = 0;
6845 if (--fmtcnt >= 0)
6846 c = *fmt++;
6847 }
6848 else if (c >= '0' && c <= '9') {
6849 prec = c - '0';
6850 while (--fmtcnt >= 0) {
6851 c = Py_CHARMASK(*fmt++);
6852 if (c < '0' || c > '9')
6853 break;
6854 if ((prec*10) / 10 != prec) {
6855 PyErr_SetString(PyExc_ValueError,
6856 "prec too big");
6857 goto onError;
6858 }
6859 prec = prec*10 + (c - '0');
6860 }
6861 }
6862 } /* prec */
6863 if (fmtcnt >= 0) {
6864 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 if (--fmtcnt >= 0)
6866 c = *fmt++;
6867 }
6868 }
6869 if (fmtcnt < 0) {
6870 PyErr_SetString(PyExc_ValueError,
6871 "incomplete format");
6872 goto onError;
6873 }
6874 if (c != '%') {
6875 v = getnextarg(args, arglen, &argidx);
6876 if (v == NULL)
6877 goto onError;
6878 }
6879 sign = 0;
6880 fill = ' ';
6881 switch (c) {
6882
6883 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006884 pbuf = formatbuf;
6885 /* presume that buffer length is at least 1 */
6886 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 len = 1;
6888 break;
6889
6890 case 's':
6891 case 'r':
6892 if (PyUnicode_Check(v) && c == 's') {
6893 temp = v;
6894 Py_INCREF(temp);
6895 }
6896 else {
6897 PyObject *unicode;
6898 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006899 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900 else
6901 temp = PyObject_Repr(v);
6902 if (temp == NULL)
6903 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006904 if (PyUnicode_Check(temp))
6905 /* nothing to do */;
6906 else if (PyString_Check(temp)) {
6907 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00006908 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006910 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 "strict");
6912 Py_DECREF(temp);
6913 temp = unicode;
6914 if (temp == NULL)
6915 goto onError;
6916 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006917 else {
6918 Py_DECREF(temp);
6919 PyErr_SetString(PyExc_TypeError,
6920 "%s argument has non-string str()");
6921 goto onError;
6922 }
6923 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006924 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 len = PyUnicode_GET_SIZE(temp);
6926 if (prec >= 0 && len > prec)
6927 len = prec;
6928 break;
6929
6930 case 'i':
6931 case 'd':
6932 case 'u':
6933 case 'o':
6934 case 'x':
6935 case 'X':
6936 if (c == 'i')
6937 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006938 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006939 temp = formatlong(v, flags, prec, c);
6940 if (!temp)
6941 goto onError;
6942 pbuf = PyUnicode_AS_UNICODE(temp);
6943 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00006944 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006946 else {
6947 pbuf = formatbuf;
6948 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6949 flags, prec, c, v);
6950 if (len < 0)
6951 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006952 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006953 }
6954 if (flags & F_ZERO)
6955 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 break;
6957
6958 case 'e':
6959 case 'E':
6960 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006961 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 case 'g':
6963 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006964 if (c == 'F')
6965 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006966 pbuf = formatbuf;
6967 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6968 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969 if (len < 0)
6970 goto onError;
6971 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006972 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 fill = '0';
6974 break;
6975
6976 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006977 pbuf = formatbuf;
6978 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 if (len < 0)
6980 goto onError;
6981 break;
6982
6983 default:
6984 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006985 "unsupported format character '%c' (0x%x) "
6986 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006987 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006988 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006989 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990 goto onError;
6991 }
6992 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006993 if (*pbuf == '-' || *pbuf == '+') {
6994 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 len--;
6996 }
6997 else if (flags & F_SIGN)
6998 sign = '+';
6999 else if (flags & F_BLANK)
7000 sign = ' ';
7001 else
7002 sign = 0;
7003 }
7004 if (width < len)
7005 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007006 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 reslen -= rescnt;
7008 rescnt = width + fmtcnt + 100;
7009 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007010 if (reslen < 0) {
7011 Py_DECREF(result);
7012 return PyErr_NoMemory();
7013 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007014 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015 return NULL;
7016 res = PyUnicode_AS_UNICODE(result)
7017 + reslen - rescnt;
7018 }
7019 if (sign) {
7020 if (fill != ' ')
7021 *res++ = sign;
7022 rescnt--;
7023 if (width > len)
7024 width--;
7025 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007026 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7027 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007028 assert(pbuf[1] == c);
7029 if (fill != ' ') {
7030 *res++ = *pbuf++;
7031 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007032 }
Tim Petersfff53252001-04-12 18:38:48 +00007033 rescnt -= 2;
7034 width -= 2;
7035 if (width < 0)
7036 width = 0;
7037 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 if (width > len && !(flags & F_LJUST)) {
7040 do {
7041 --rescnt;
7042 *res++ = fill;
7043 } while (--width > len);
7044 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007045 if (fill == ' ') {
7046 if (sign)
7047 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007048 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007049 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007050 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007051 *res++ = *pbuf++;
7052 *res++ = *pbuf++;
7053 }
7054 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007055 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056 res += len;
7057 rescnt -= len;
7058 while (--width >= len) {
7059 --rescnt;
7060 *res++ = ' ';
7061 }
7062 if (dict && (argidx < arglen) && c != '%') {
7063 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007064 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 goto onError;
7066 }
7067 Py_XDECREF(temp);
7068 } /* '%' */
7069 } /* until end */
7070 if (argidx < arglen && !dict) {
7071 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007072 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073 goto onError;
7074 }
7075
7076 if (args_owned) {
7077 Py_DECREF(args);
7078 }
7079 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007080 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007081 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082 return (PyObject *)result;
7083
7084 onError:
7085 Py_XDECREF(result);
7086 Py_DECREF(uformat);
7087 if (args_owned) {
7088 Py_DECREF(args);
7089 }
7090 return NULL;
7091}
7092
7093static PyBufferProcs unicode_as_buffer = {
7094 (getreadbufferproc) unicode_buffer_getreadbuf,
7095 (getwritebufferproc) unicode_buffer_getwritebuf,
7096 (getsegcountproc) unicode_buffer_getsegcount,
7097 (getcharbufferproc) unicode_buffer_getcharbuf,
7098};
7099
Jeremy Hylton938ace62002-07-17 16:30:39 +00007100static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007101unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7102
Tim Peters6d6c1a32001-08-02 04:15:00 +00007103static PyObject *
7104unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7105{
7106 PyObject *x = NULL;
7107 static char *kwlist[] = {"string", "encoding", "errors", 0};
7108 char *encoding = NULL;
7109 char *errors = NULL;
7110
Guido van Rossume023fe02001-08-30 03:12:59 +00007111 if (type != &PyUnicode_Type)
7112 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007113 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7114 kwlist, &x, &encoding, &errors))
7115 return NULL;
7116 if (x == NULL)
7117 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007118 if (encoding == NULL && errors == NULL)
7119 return PyObject_Unicode(x);
7120 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007121 return PyUnicode_FromEncodedObject(x, encoding, errors);
7122}
7123
Guido van Rossume023fe02001-08-30 03:12:59 +00007124static PyObject *
7125unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7126{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007127 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007128 int n;
7129
7130 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7131 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7132 if (tmp == NULL)
7133 return NULL;
7134 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007135 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007136 if (pnew == NULL) {
7137 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007138 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007139 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007140 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7141 if (pnew->str == NULL) {
7142 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007143 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007144 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007145 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007146 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007147 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7148 pnew->length = n;
7149 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007150 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007151 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007152}
7153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007154PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007155"unicode(string [, encoding[, errors]]) -> object\n\
7156\n\
7157Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007158encoding defaults to the current default string encoding.\n\
7159errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007160
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161PyTypeObject PyUnicode_Type = {
7162 PyObject_HEAD_INIT(&PyType_Type)
7163 0, /* ob_size */
7164 "unicode", /* tp_name */
7165 sizeof(PyUnicodeObject), /* tp_size */
7166 0, /* tp_itemsize */
7167 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007168 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007170 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171 0, /* tp_setattr */
7172 (cmpfunc) unicode_compare, /* tp_compare */
7173 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007174 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007176 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177 (hashfunc) unicode_hash, /* tp_hash*/
7178 0, /* tp_call*/
7179 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007180 PyObject_GenericGetAttr, /* tp_getattro */
7181 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007183 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7184 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007185 unicode_doc, /* tp_doc */
7186 0, /* tp_traverse */
7187 0, /* tp_clear */
7188 0, /* tp_richcompare */
7189 0, /* tp_weaklistoffset */
7190 0, /* tp_iter */
7191 0, /* tp_iternext */
7192 unicode_methods, /* tp_methods */
7193 0, /* tp_members */
7194 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007195 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007196 0, /* tp_dict */
7197 0, /* tp_descr_get */
7198 0, /* tp_descr_set */
7199 0, /* tp_dictoffset */
7200 0, /* tp_init */
7201 0, /* tp_alloc */
7202 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007203 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204};
7205
7206/* Initialize the Unicode implementation */
7207
Thomas Wouters78890102000-07-22 19:25:51 +00007208void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007210 int i;
7211
Fred Drakee4315f52000-05-09 19:53:39 +00007212 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007213 unicode_freelist = NULL;
7214 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007216 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007217 for (i = 0; i < 256; i++)
7218 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007219 if (PyType_Ready(&PyUnicode_Type) < 0)
7220 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221}
7222
7223/* Finalize the Unicode implementation */
7224
7225void
Thomas Wouters78890102000-07-22 19:25:51 +00007226_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007228 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007229 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007231 Py_XDECREF(unicode_empty);
7232 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007233
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007234 for (i = 0; i < 256; i++) {
7235 if (unicode_latin1[i]) {
7236 Py_DECREF(unicode_latin1[i]);
7237 unicode_latin1[i] = NULL;
7238 }
7239 }
7240
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007241 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 PyUnicodeObject *v = u;
7243 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007244 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007245 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007246 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007247 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007249 unicode_freelist = NULL;
7250 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007252
7253/*
7254Local variables:
7255c-basic-offset: 4
7256indent-tabs-mode: nil
7257End:
7258*/