blob: 2fe96681a0564dfb324496871211f8df99425c93 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a PyUnicodeObject* variable to the PyObject** the API expects. */
#define _PyUnicode_Resize(unicodevar, length) PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
400 }
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
406 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
413{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000414 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 int owned = 0;
417 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
422 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000423
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000424#if 0
425 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000429
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
432
433 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000441 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000442#else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000448#endif
449
450 /* Coerce object */
451 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000462 obj->ob_type->tp_name);
463 goto onError;
464 }
465
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 if (len == 0) {
468 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000473
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return v;
478
479 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000480 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000481 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000482 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484}
485
486PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
490{
491 PyObject *buffer = NULL, *unicode;
492
Fred Drakee4315f52000-05-09 19:53:39 +0000493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
495
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
517 }
518 Py_DECREF(buffer);
519 return unicode;
520
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v, *unicode;
532
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
544{
545 PyObject *v;
546
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
550 }
Fred Drakee4315f52000-05-09 19:53:39 +0000551
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
554
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000558 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000572 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
576 }
577 return v;
578
579 onError:
580 return NULL;
581}
582
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000644/* --- UTF-7 Codec -------------------------------------------------------- */
645
646/* see RFC2152 for details */
647
/* Classification of the 128 ASCII characters for UTF-7 (RFC 2152);
   says whether a character can be encoded directly:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   The table is read-only, hence const. */
static
const char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
666
/* SPECIAL(c, encodeO, encodeWS): true when character c cannot be
   written directly in UTF-7 given the two optional-set switches. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is one of the 64 base64 digits.  Uses
   explicit ASCII range tests rather than isalnum(): c may be a
   Py_UNICODE outside the unsigned char range (undefined for the
   <ctype.h> functions), and isalnum() is locale dependent. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE: emit base64 digits from the bit buffer ch while at least six
   bits are pending; updates out and bits. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE: emit 16-bit characters from the bit buffer ch while at least
   16 bits are pending; updates out, bits and surrogate.  Relies on the
   enclosing function providing `errmsg` and a `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
702static
703int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
706{
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
713 }
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
716 }
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
721 }
722 return 0;
723 }
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
729 }
730}
731
732PyObject *PyUnicode_DecodeUTF7(const char *s,
733 int size,
734 const char *errors)
735{
736 const char *e;
737 PyUnicodeObject *unicode;
738 Py_UNICODE *p;
739 const char *errmsg = "";
740 int inShift = 0;
741 unsigned int bitsleft = 0;
742 unsigned long charsleft = 0;
743 int surrogate = 0;
744
745 unicode = _PyUnicode_New(size);
746 if (!unicode)
747 return NULL;
748 if (size == 0)
749 return (PyObject *)unicode;
750
751 p = unicode->str;
752 e = s + size;
753
754 while (s < e) {
755 Py_UNICODE ch = *s;
756
757 if (inShift) {
758 if ((ch == '-') || !B64CHAR(ch)) {
759 inShift = 0;
760 s++;
761
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763 if (bitsleft >= 6) {
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
767
768 errmsg = "partial character in shift sequence";
769 goto utf7Error;
770 }
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
774
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777 errmsg = "non-zero padding bits in shift sequence";
778 goto utf7Error;
779 }
780
781 if (ch == '-') {
782 if ((s < e) && (*(s) == '-')) {
783 *p++ = '-';
784 inShift = 1;
785 }
786 } else if (SPECIAL(ch,0,0)) {
787 errmsg = "unexpected special character";
788 goto utf7Error;
789 } else {
790 *p++ = ch;
791 }
792 } else {
793 charsleft = (charsleft << 6) | UB64(ch);
794 bitsleft += 6;
795 s++;
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
797 }
798 }
799 else if ( ch == '+' ) {
800 s++;
801 if (s < e && *s == '-') {
802 s++;
803 *p++ = '+';
804 } else
805 {
806 inShift = 1;
807 bitsleft = 0;
808 }
809 }
810 else if (SPECIAL(ch,0,0)) {
811 errmsg = "unexpected special character";
812 s++;
813 goto utf7Error;
814 }
815 else {
816 *p++ = ch;
817 s++;
818 }
819 continue;
820 utf7Error:
821 if (utf7_decoding_error(&p, errors, errmsg))
822 goto onError;
823 }
824
825 if (inShift) {
826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827 goto onError;
828 }
829
830 if (_PyUnicode_Resize(&unicode, p - unicode->str))
831 goto onError;
832
833 return (PyObject *)unicode;
834
835onError:
836 Py_DECREF(unicode);
837 return NULL;
838}
839
840
841PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842 int size,
843 int encodeSetO,
844 int encodeWhiteSpace,
845 const char *errors)
846{
847 PyObject *v;
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated = 5 * size;
850 int inShift = 0;
851 int i = 0;
852 unsigned int bitsleft = 0;
853 unsigned long charsleft = 0;
854 char * out;
855 char * start;
856
857 if (size == 0)
858 return PyString_FromStringAndSize(NULL, 0);
859
860 v = PyString_FromStringAndSize(NULL, cbAllocated);
861 if (v == NULL)
862 return NULL;
863
864 start = out = PyString_AS_STRING(v);
865 for (;i < size; ++i) {
866 Py_UNICODE ch = s[i];
867
868 if (!inShift) {
869 if (ch == '+') {
870 *out++ = '+';
871 *out++ = '-';
872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873 charsleft = ch;
874 bitsleft = 16;
875 *out++ = '+';
876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877 inShift = bitsleft > 0;
878 } else {
879 *out++ = (char) ch;
880 }
881 } else {
882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883 *out++ = B64(charsleft << (6-bitsleft));
884 charsleft = 0;
885 bitsleft = 0;
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch) || ch == '-') {
889 *out++ = '-';
890 }
891 inShift = 0;
892 *out++ = (char) ch;
893 } else {
894 bitsleft += 16;
895 charsleft = (charsleft << 16) | ch;
896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
897
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
902
903 if (bitsleft == 0) {
904 if (i + 1 < size) {
905 Py_UNICODE ch2 = s[i+1];
906
907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
908
909 } else if (B64CHAR(ch2) || ch2 == '-') {
910 *out++ = '-';
911 inShift = 0;
912 } else {
913 inShift = 0;
914 }
915
916 }
917 else {
918 *out++ = '-';
919 inShift = 0;
920 }
921 }
922 }
923 }
924 }
925 if (bitsleft) {
926 *out++= B64(charsleft << (6-bitsleft) );
927 *out++ = '-';
928 }
929
930 if (_PyString_Resize(&v, out - start)) {
931 Py_DECREF(v);
932 return NULL;
933 }
934 return v;
935}
936
/* Retire the helper macros used by the UTF-7 codec above so they cannot
   leak into the codecs that follow. */
#undef B64
#undef B64CHAR
#undef DECODE
#undef ENCODE
#undef SPECIAL
#undef UB64
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944/* --- UTF-8 Codec -------------------------------------------------------- */
945
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is not a legal prefix.
   See RFC 2279 for details.

   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four-, five- and six-byte sequences; 0xFE, 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
967
968static
969int utf8_decoding_error(const char **source,
970 Py_UNICODE **dest,
971 const char *errors,
972 const char *details)
973{
974 if ((errors == NULL) ||
975 (strcmp(errors,"strict") == 0)) {
976 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000977 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000978 details);
979 return -1;
980 }
981 else if (strcmp(errors,"ignore") == 0) {
982 (*source)++;
983 return 0;
984 }
985 else if (strcmp(errors,"replace") == 0) {
986 (*source)++;
987 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
988 (*dest)++;
989 return 0;
990 }
991 else {
992 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000993 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994 errors);
995 return -1;
996 }
997}
998
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999PyObject *PyUnicode_DecodeUTF8(const char *s,
1000 int size,
1001 const char *errors)
1002{
1003 int n;
1004 const char *e;
1005 PyUnicodeObject *unicode;
1006 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001007 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008
1009 /* Note: size will always be longer than the resulting Unicode
1010 character count */
1011 unicode = _PyUnicode_New(size);
1012 if (!unicode)
1013 return NULL;
1014 if (size == 0)
1015 return (PyObject *)unicode;
1016
1017 /* Unpack UTF-8 encoded data */
1018 p = unicode->str;
1019 e = s + size;
1020
1021 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001022 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
1024 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001025 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 s++;
1027 continue;
1028 }
1029
1030 n = utf8_code_length[ch];
1031
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001032 if (s + n > e) {
1033 errmsg = "unexpected end of data";
1034 goto utf8Error;
1035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036
1037 switch (n) {
1038
1039 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001040 errmsg = "unexpected code byte";
1041 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001044 errmsg = "internal error";
1045 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046
1047 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001048 if ((s[1] & 0xc0) != 0x80) {
1049 errmsg = "invalid data";
1050 goto utf8Error;
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001053 if (ch < 0x80) {
1054 errmsg = "illegal encoding";
1055 goto utf8Error;
1056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001058 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 break;
1060
1061 case 3:
1062 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001063 (s[2] & 0xc0) != 0x80) {
1064 errmsg = "invalid data";
1065 goto utf8Error;
1066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001068 if (ch < 0x0800) {
1069 /* Note: UTF-8 encodings of surrogates are considered
1070 legal UTF-8 sequences;
1071
1072 XXX For wide builds (UCS-4) we should probably try
1073 to recombine the surrogates into a single code
1074 unit.
1075 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001076 errmsg = "illegal encoding";
1077 goto utf8Error;
1078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001080 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001081 break;
1082
1083 case 4:
1084 if ((s[1] & 0xc0) != 0x80 ||
1085 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001086 (s[3] & 0xc0) != 0x80) {
1087 errmsg = "invalid data";
1088 goto utf8Error;
1089 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001090 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1091 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1092 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001093 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001094 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001095 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001096 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001097 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001098 errmsg = "illegal encoding";
1099 goto utf8Error;
1100 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001101#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001102 *p++ = (Py_UNICODE)ch;
1103#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001104 /* compute and append the two surrogates: */
1105
1106 /* translate from 10000..10FFFF to 0..FFFF */
1107 ch -= 0x10000;
1108
1109 /* high surrogate = top 10 bits added to D800 */
1110 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1111
1112 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001113 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001114#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 break;
1116
1117 default:
1118 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001119 errmsg = "unsupported Unicode code range";
1120 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 }
1122 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001123 continue;
1124
1125 utf8Error:
1126 if (utf8_decoding_error(&s, &p, errors, errmsg))
1127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 }
1129
1130 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001131 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 goto onError;
1133
1134 return (PyObject *)unicode;
1135
1136onError:
1137 Py_DECREF(unicode);
1138 return NULL;
1139}
1140
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001141/* Not used anymore, now that the encoder supports UTF-16
1142 surrogates. */
#if 0
/* Disabled: apply the error-handling mode `errors` to a UTF-8 encoding
   problem described by `details`.

   - NULL or "strict": raise UnicodeError; return -1.
   - "ignore":         do nothing; return 0.
   - "replace":        store '?' at **dest and advance *dest; return 0.
   - anything else:    raise ValueError; return -1. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (!strcmp(errors, "ignore"))
        return 0;

    if (!strcmp(errors, "replace")) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001175/* Allocation strategy: we default to Latin-1, then do one resize
1176 whenever we hit an order boundary. The assumption is that
1177 characters from higher orders usually occur often enough to warrant
1178 this.
1179*/
1180
Tim Peters7e3d9612002-04-21 03:26:37 +00001181PyObject *
1182PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1183 int size,
1184 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185{
1186 PyObject *v;
1187 char *p;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001188 int len;
Tim Peters0eca65c2002-04-21 17:28:06 +00001189 int i = 0;
1190 long overalloc = 2;
1191 int nallocated; /* overalloc * size; PyString_ adds one more for \0 */
1192
1193 /* Short-cut for empty strings */
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001194 if (size == 0)
1195 return PyString_FromStringAndSize(NULL, 0);
1196
Tim Peters0eca65c2002-04-21 17:28:06 +00001197 nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
1198 v = PyString_FromStringAndSize(NULL, nallocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199 if (v == NULL)
1200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001202 p = PyString_AS_STRING(v);
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001203
1204 while (i < size) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001205 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001206
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001207 if (ch < 0x80)
1208 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001210
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 else if (ch < 0x0800) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001212 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001213 *p++ = (char)(0xc0 | (ch >> 6));
1214 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001215 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001216
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001217 else {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001218 /* Encode UCS2 Unicode ordinals */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001219 if (ch < 0x10000) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001220
1221 /* Special case: check for high surrogate */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001222 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1223 Py_UCS4 ch2 = s[i];
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001224 /* Check for low surrogate and combine the two to
1225 form a UCS4 value */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001226 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001227 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1228 i++;
1229 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001230 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001231 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001232 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001233
1234 if (overalloc < 3) {
Tim Peters0eca65c2002-04-21 17:28:06 +00001235 len = Py_SAFE_DOWNCAST(p-PyString_AS_STRING(v), long, int);
1236 assert(len <= nallocated);
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001237 overalloc = 3;
Tim Peters0eca65c2002-04-21 17:28:06 +00001238 nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
1239 if (_PyString_Resize(&v, nallocated))
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001240 goto onError;
1241 p = PyString_AS_STRING(v) + len;
1242 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001243 *p++ = (char)(0xe0 | (ch >> 12));
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001244 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1245 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001246 continue;
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001247 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001248
1249 /* Encode UCS4 Unicode ordinals */
1250 encodeUCS4:
1251 if (overalloc < 4) {
Tim Peters0eca65c2002-04-21 17:28:06 +00001252 len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1253 assert(len <= nallocated);
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001254 overalloc = 4;
Tim Peters0eca65c2002-04-21 17:28:06 +00001255 nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
1256 if (_PyString_Resize(&v, nallocated))
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001257 goto onError;
1258 p = PyString_AS_STRING(v) + len;
1259 }
1260 *p++ = (char)(0xf0 | (ch >> 18));
1261 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1262 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1263 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001266
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001267 *p = '\0';
Tim Peters0eca65c2002-04-21 17:28:06 +00001268 len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1269 assert(len <= nallocated);
1270 if (_PyString_Resize(&v, len))
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001271 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001273
1274 onError:
1275 Py_DECREF(v);
1276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277}
1278
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1280{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281 if (!PyUnicode_Check(unicode)) {
1282 PyErr_BadArgument();
1283 return NULL;
1284 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001285 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1286 PyUnicode_GET_SIZE(unicode),
1287 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288}
1289
1290/* --- UTF-16 Codec ------------------------------------------------------- */
1291
1292static
Tim Peters772747b2001-08-09 22:21:55 +00001293int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294 const char *errors,
1295 const char *details)
1296{
1297 if ((errors == NULL) ||
1298 (strcmp(errors,"strict") == 0)) {
1299 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001300 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301 details);
1302 return -1;
1303 }
1304 else if (strcmp(errors,"ignore") == 0) {
1305 return 0;
1306 }
1307 else if (strcmp(errors,"replace") == 0) {
1308 if (dest) {
1309 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1310 (*dest)++;
1311 }
1312 return 0;
1313 }
1314 else {
1315 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001316 "UTF-16 decoding error; "
1317 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318 errors);
1319 return -1;
1320 }
1321}
1322
Tim Peters772747b2001-08-09 22:21:55 +00001323PyObject *
1324PyUnicode_DecodeUTF16(const char *s,
1325 int size,
1326 const char *errors,
1327 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001328{
1329 PyUnicodeObject *unicode;
1330 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001331 const unsigned char *q, *e;
1332 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001333 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001334 /* Offsets from q for retrieving byte pairs in the right order. */
1335#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1336 int ihi = 1, ilo = 0;
1337#else
1338 int ihi = 0, ilo = 1;
1339#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340
1341 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001342 if (size & 1) {
1343 if (utf16_decoding_error(NULL, errors, "truncated data"))
1344 return NULL;
1345 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346 }
1347
1348 /* Note: size will always be longer than the resulting Unicode
1349 character count */
1350 unicode = _PyUnicode_New(size);
1351 if (!unicode)
1352 return NULL;
1353 if (size == 0)
1354 return (PyObject *)unicode;
1355
1356 /* Unpack UTF-16 encoded data */
1357 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001358 q = (unsigned char *)s;
1359 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360
1361 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001362 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001364 /* Check for BOM marks (U+FEFF) in the input and adjust current
1365 byte order setting accordingly. In native mode, the leading BOM
1366 mark is skipped, in all other modes, it is copied to the output
1367 stream as-is (giving a ZWNBSP character). */
1368 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001369 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001370#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001371 if (bom == 0xFEFF) {
1372 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001373 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001374 }
1375 else if (bom == 0xFFFE) {
1376 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001377 bo = 1;
1378 }
1379#else
Tim Peters772747b2001-08-09 22:21:55 +00001380 if (bom == 0xFEFF) {
1381 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001382 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001383 }
1384 else if (bom == 0xFFFE) {
1385 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001386 bo = -1;
1387 }
1388#endif
1389 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390
Tim Peters772747b2001-08-09 22:21:55 +00001391 if (bo == -1) {
1392 /* force LE */
1393 ihi = 1;
1394 ilo = 0;
1395 }
1396 else if (bo == 1) {
1397 /* force BE */
1398 ihi = 0;
1399 ilo = 1;
1400 }
1401
1402 while (q < e) {
1403 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1404 q += 2;
1405
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406 if (ch < 0xD800 || ch > 0xDFFF) {
1407 *p++ = ch;
1408 continue;
1409 }
1410
1411 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001412 if (q >= e) {
1413 errmsg = "unexpected end of data";
1414 goto utf16Error;
1415 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001416 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001417 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1418 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001419 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001420#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001421 *p++ = ch;
1422 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001423#else
1424 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001425#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001426 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001427 }
1428 else {
1429 errmsg = "illegal UTF-16 surrogate";
1430 goto utf16Error;
1431 }
1432
Guido van Rossumd57fd912000-03-10 22:53:23 +00001433 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001434 errmsg = "illegal encoding";
1435 /* Fall through to report the error */
1436
1437 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001438 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001439 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440 }
1441
1442 if (byteorder)
1443 *byteorder = bo;
1444
1445 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001446 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 goto onError;
1448
1449 return (PyObject *)unicode;
1450
1451onError:
1452 Py_DECREF(unicode);
1453 return NULL;
1454}
1455
Tim Peters772747b2001-08-09 22:21:55 +00001456PyObject *
1457PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1458 int size,
1459 const char *errors,
1460 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461{
1462 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001463 unsigned char *p;
1464 int i, pairs;
1465 /* Offsets from p for storing byte pairs in the right order. */
1466#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1467 int ihi = 1, ilo = 0;
1468#else
1469 int ihi = 0, ilo = 1;
1470#endif
1471
1472#define STORECHAR(CH) \
1473 do { \
1474 p[ihi] = ((CH) >> 8) & 0xff; \
1475 p[ilo] = (CH) & 0xff; \
1476 p += 2; \
1477 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001479 for (i = pairs = 0; i < size; i++)
1480 if (s[i] >= 0x10000)
1481 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001483 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001484 if (v == NULL)
1485 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486
Tim Peters772747b2001-08-09 22:21:55 +00001487 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001488 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001489 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001490 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001491 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001492
1493 if (byteorder == -1) {
1494 /* force LE */
1495 ihi = 1;
1496 ilo = 0;
1497 }
1498 else if (byteorder == 1) {
1499 /* force BE */
1500 ihi = 0;
1501 ilo = 1;
1502 }
1503
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001504 while (size-- > 0) {
1505 Py_UNICODE ch = *s++;
1506 Py_UNICODE ch2 = 0;
1507 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001508 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1509 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510 }
Tim Peters772747b2001-08-09 22:21:55 +00001511 STORECHAR(ch);
1512 if (ch2)
1513 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001516#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517}
1518
1519PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1520{
1521 if (!PyUnicode_Check(unicode)) {
1522 PyErr_BadArgument();
1523 return NULL;
1524 }
1525 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1526 PyUnicode_GET_SIZE(unicode),
1527 NULL,
1528 0);
1529}
1530
1531/* --- Unicode Escape Codec ----------------------------------------------- */
1532
1533static
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001534int unicodeescape_decoding_error(Py_UNICODE **x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535 const char *errors,
1536 const char *details)
1537{
1538 if ((errors == NULL) ||
1539 (strcmp(errors,"strict") == 0)) {
1540 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001541 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 details);
1543 return -1;
1544 }
1545 else if (strcmp(errors,"ignore") == 0) {
1546 return 0;
1547 }
1548 else if (strcmp(errors,"replace") == 0) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001549 **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1550 (*x)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001551 return 0;
1552 }
1553 else {
1554 PyErr_Format(PyExc_ValueError,
1555 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001556 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557 errors);
1558 return -1;
1559 }
1560}
1561
Fredrik Lundh06d12682001-01-24 07:59:11 +00001562static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001563
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1565 int size,
1566 const char *errors)
1567{
1568 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001569 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001571 char* message;
1572 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1573
Guido van Rossumd57fd912000-03-10 22:53:23 +00001574 /* Escaped strings will always be longer than the resulting
1575 Unicode string, so we start with size here and then reduce the
1576 length after conversion to the true value. */
1577 v = _PyUnicode_New(size);
1578 if (v == NULL)
1579 goto onError;
1580 if (size == 0)
1581 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001582
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583 p = buf = PyUnicode_AS_UNICODE(v);
1584 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001585
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586 while (s < end) {
1587 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001588 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001589 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590
1591 /* Non-escape characters are interpreted as Unicode ordinals */
1592 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001593 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 continue;
1595 }
1596
1597 /* \ - Escapes */
1598 s++;
1599 switch (*s++) {
1600
1601 /* \x escapes */
1602 case '\n': break;
1603 case '\\': *p++ = '\\'; break;
1604 case '\'': *p++ = '\''; break;
1605 case '\"': *p++ = '\"'; break;
1606 case 'b': *p++ = '\b'; break;
1607 case 'f': *p++ = '\014'; break; /* FF */
1608 case 't': *p++ = '\t'; break;
1609 case 'n': *p++ = '\n'; break;
1610 case 'r': *p++ = '\r'; break;
1611 case 'v': *p++ = '\013'; break; /* VT */
1612 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1613
1614 /* \OOO (octal) escapes */
1615 case '0': case '1': case '2': case '3':
1616 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001617 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001619 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001621 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001623 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001624 break;
1625
Fredrik Lundhccc74732001-02-18 22:13:49 +00001626 /* hex escapes */
1627 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001629 digits = 2;
1630 message = "truncated \\xXX escape";
1631 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001632
Fredrik Lundhccc74732001-02-18 22:13:49 +00001633 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001634 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001635 digits = 4;
1636 message = "truncated \\uXXXX escape";
1637 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638
Fredrik Lundhccc74732001-02-18 22:13:49 +00001639 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001640 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001641 digits = 8;
1642 message = "truncated \\UXXXXXXXX escape";
1643 hexescape:
1644 chr = 0;
1645 for (i = 0; i < digits; i++) {
1646 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001647 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001648 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001649 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001650 chr = 0xffffffff;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001651 i++;
1652 break;
1653 }
1654 chr = (chr<<4) & ~0xF;
1655 if (c >= '0' && c <= '9')
1656 chr += c - '0';
1657 else if (c >= 'a' && c <= 'f')
1658 chr += 10 + c - 'a';
1659 else
1660 chr += 10 + c - 'A';
1661 }
1662 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001663 if (chr == 0xffffffff)
1664 /* _decoding_error will have already written into the
1665 target buffer. */
1666 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001667 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001668 /* when we get here, chr is a 32-bit unicode character */
1669 if (chr <= 0xffff)
1670 /* UCS-2 character */
1671 *p++ = (Py_UNICODE) chr;
1672 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001673 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001674 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001675#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001676 *p++ = chr;
1677#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001678 chr -= 0x10000L;
1679 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001680 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001681#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001682 } else {
1683 if (unicodeescape_decoding_error(
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001684 &p, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001685 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001686 )
1687 goto onError;
1688 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001689 break;
1690
1691 /* \N{name} */
1692 case 'N':
1693 message = "malformed \\N character escape";
1694 if (ucnhash_CAPI == NULL) {
1695 /* load the unicode data module */
1696 PyObject *m, *v;
1697 m = PyImport_ImportModule("unicodedata");
1698 if (m == NULL)
1699 goto ucnhashError;
1700 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1701 Py_DECREF(m);
1702 if (v == NULL)
1703 goto ucnhashError;
1704 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1705 Py_DECREF(v);
1706 if (ucnhash_CAPI == NULL)
1707 goto ucnhashError;
1708 }
1709 if (*s == '{') {
1710 const char *start = s+1;
1711 /* look for the closing brace */
1712 while (*s != '}' && s < end)
1713 s++;
1714 if (s > start && s < end && *s == '}') {
1715 /* found a name. look it up in the unicode database */
1716 message = "unknown Unicode character name";
1717 s++;
1718 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1719 goto store;
1720 }
1721 }
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001722 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001723 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001724 break;
1725
1726 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001727 if (s > end) {
1728 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1729 goto onError;
1730 }
1731 else {
1732 *p++ = '\\';
1733 *p++ = (unsigned char)s[-1];
1734 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001735 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 }
1737 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001738 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Walter Dörwald8c077222002-03-25 11:16:18 +00001739 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001741
Fredrik Lundhccc74732001-02-18 22:13:49 +00001742ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001743 PyErr_SetString(
1744 PyExc_UnicodeError,
1745 "\\N escapes not supported (can't load unicodedata module)"
1746 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001747 return NULL;
1748
Fredrik Lundhccc74732001-02-18 22:13:49 +00001749onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 Py_XDECREF(v);
1751 return NULL;
1752}
1753
1754/* Return a Unicode-Escape string version of the Unicode object.
1755
1756 If quotes is true, the string is enclosed in u"" or u'' quotes as
1757 appropriate.
1758
1759*/
1760
Barry Warsaw51ac5802000-03-20 16:36:48 +00001761static const Py_UNICODE *findchar(const Py_UNICODE *s,
1762 int size,
1763 Py_UNICODE ch);
1764
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765static
1766PyObject *unicodeescape_string(const Py_UNICODE *s,
1767 int size,
1768 int quotes)
1769{
1770 PyObject *repr;
1771 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001773 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774
1775 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1776 if (repr == NULL)
1777 return NULL;
1778
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001779 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780
1781 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 *p++ = 'u';
1783 *p++ = (findchar(s, size, '\'') &&
1784 !findchar(s, size, '"')) ? '"' : '\'';
1785 }
1786 while (size-- > 0) {
1787 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001788
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001790 if (quotes &&
1791 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792 *p++ = '\\';
1793 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001794 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001796
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001797#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001798 /* Map 21-bit characters to '\U00xxxxxx' */
1799 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001800 int offset = p - PyString_AS_STRING(repr);
1801
1802 /* Resize the string if necessary */
1803 if (offset + 12 > PyString_GET_SIZE(repr)) {
1804 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1805 goto onError;
1806 p = PyString_AS_STRING(repr) + offset;
1807 }
1808
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001809 *p++ = '\\';
1810 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001811 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1812 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1813 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1814 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1815 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1816 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1817 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001818 *p++ = hexdigit[ch & 0x0000000F];
1819 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001820 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001821#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001822 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1823 else if (ch >= 0xD800 && ch < 0xDC00) {
1824 Py_UNICODE ch2;
1825 Py_UCS4 ucs;
1826
1827 ch2 = *s++;
1828 size--;
1829 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1830 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1831 *p++ = '\\';
1832 *p++ = 'U';
1833 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1834 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1835 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1836 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1837 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1838 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1839 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1840 *p++ = hexdigit[ucs & 0x0000000F];
1841 continue;
1842 }
1843 /* Fall through: isolated surrogates are copied as-is */
1844 s--;
1845 size++;
1846 }
1847
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001849 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 *p++ = '\\';
1851 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001852 *p++ = hexdigit[(ch >> 12) & 0x000F];
1853 *p++ = hexdigit[(ch >> 8) & 0x000F];
1854 *p++ = hexdigit[(ch >> 4) & 0x000F];
1855 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001857
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001858 /* Map special whitespace to '\t', \n', '\r' */
1859 else if (ch == '\t') {
1860 *p++ = '\\';
1861 *p++ = 't';
1862 }
1863 else if (ch == '\n') {
1864 *p++ = '\\';
1865 *p++ = 'n';
1866 }
1867 else if (ch == '\r') {
1868 *p++ = '\\';
1869 *p++ = 'r';
1870 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001871
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001872 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001873 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001875 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001876 *p++ = hexdigit[(ch >> 4) & 0x000F];
1877 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001879
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880 /* Copy everything else as-is */
1881 else
1882 *p++ = (char) ch;
1883 }
1884 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001885 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886
1887 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001888 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001889 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890
1891 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001892
1893 onError:
1894 Py_DECREF(repr);
1895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001896}
1897
1898PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1899 int size)
1900{
1901 return unicodeescape_string(s, size, 0);
1902}
1903
1904PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1905{
1906 if (!PyUnicode_Check(unicode)) {
1907 PyErr_BadArgument();
1908 return NULL;
1909 }
1910 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1911 PyUnicode_GET_SIZE(unicode));
1912}
1913
1914/* --- Raw Unicode Escape Codec ------------------------------------------- */
1915
1916PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1917 int size,
1918 const char *errors)
1919{
1920 PyUnicodeObject *v;
1921 Py_UNICODE *p, *buf;
1922 const char *end;
1923 const char *bs;
1924
1925 /* Escaped strings will always be longer than the resulting
1926 Unicode string, so we start with size here and then reduce the
1927 length after conversion to the true value. */
1928 v = _PyUnicode_New(size);
1929 if (v == NULL)
1930 goto onError;
1931 if (size == 0)
1932 return (PyObject *)v;
1933 p = buf = PyUnicode_AS_UNICODE(v);
1934 end = s + size;
1935 while (s < end) {
1936 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001937 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 int i;
1939
1940 /* Non-escape characters are interpreted as Unicode ordinals */
1941 if (*s != '\\') {
1942 *p++ = (unsigned char)*s++;
1943 continue;
1944 }
1945
1946 /* \u-escapes are only interpreted iff the number of leading
1947 backslashes if odd */
1948 bs = s;
1949 for (;s < end;) {
1950 if (*s != '\\')
1951 break;
1952 *p++ = (unsigned char)*s++;
1953 }
1954 if (((s - bs) & 1) == 0 ||
1955 s >= end ||
1956 *s != 'u') {
1957 continue;
1958 }
1959 p--;
1960 s++;
1961
1962 /* \uXXXX with 4 hex digits */
1963 for (x = 0, i = 0; i < 4; i++) {
1964 c = (unsigned char)s[i];
1965 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001966 if (unicodeescape_decoding_error(&p, errors,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 "truncated \\uXXXX"))
1968 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001969 x = 0xffffffff;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970 i++;
1971 break;
1972 }
1973 x = (x<<4) & ~0xF;
1974 if (c >= '0' && c <= '9')
1975 x += c - '0';
1976 else if (c >= 'a' && c <= 'f')
1977 x += 10 + c - 'a';
1978 else
1979 x += 10 + c - 'A';
1980 }
1981 s += i;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001982 if (x != 0xffffffff)
1983 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001985 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001986 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 return (PyObject *)v;
1988
1989 onError:
1990 Py_XDECREF(v);
1991 return NULL;
1992}
1993
1994PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1995 int size)
1996{
1997 PyObject *repr;
1998 char *p;
1999 char *q;
2000
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002001 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002002
2003 repr = PyString_FromStringAndSize(NULL, 6 * size);
2004 if (repr == NULL)
2005 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002006 if (size == 0)
2007 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008
2009 p = q = PyString_AS_STRING(repr);
2010 while (size-- > 0) {
2011 Py_UNICODE ch = *s++;
2012 /* Map 16-bit characters to '\uxxxx' */
2013 if (ch >= 256) {
2014 *p++ = '\\';
2015 *p++ = 'u';
2016 *p++ = hexdigit[(ch >> 12) & 0xf];
2017 *p++ = hexdigit[(ch >> 8) & 0xf];
2018 *p++ = hexdigit[(ch >> 4) & 0xf];
2019 *p++ = hexdigit[ch & 15];
2020 }
2021 /* Copy everything else as-is */
2022 else
2023 *p++ = (char) ch;
2024 }
2025 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002026 if (_PyString_Resize(&repr, p - q))
2027 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028
2029 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002030
2031 onError:
2032 Py_DECREF(repr);
2033 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034}
2035
2036PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2037{
2038 if (!PyUnicode_Check(unicode)) {
2039 PyErr_BadArgument();
2040 return NULL;
2041 }
2042 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2043 PyUnicode_GET_SIZE(unicode));
2044}
2045
2046/* --- Latin-1 Codec ------------------------------------------------------ */
2047
2048PyObject *PyUnicode_DecodeLatin1(const char *s,
2049 int size,
2050 const char *errors)
2051{
2052 PyUnicodeObject *v;
2053 Py_UNICODE *p;
2054
2055 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002056 if (size == 1 && *(unsigned char*)s < 256) {
2057 Py_UNICODE r = *(unsigned char*)s;
2058 return PyUnicode_FromUnicode(&r, 1);
2059 }
2060
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 v = _PyUnicode_New(size);
2062 if (v == NULL)
2063 goto onError;
2064 if (size == 0)
2065 return (PyObject *)v;
2066 p = PyUnicode_AS_UNICODE(v);
2067 while (size-- > 0)
2068 *p++ = (unsigned char)*s++;
2069 return (PyObject *)v;
2070
2071 onError:
2072 Py_XDECREF(v);
2073 return NULL;
2074}
2075
2076static
2077int latin1_encoding_error(const Py_UNICODE **source,
2078 char **dest,
2079 const char *errors,
2080 const char *details)
2081{
2082 if ((errors == NULL) ||
2083 (strcmp(errors,"strict") == 0)) {
2084 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002085 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086 details);
2087 return -1;
2088 }
2089 else if (strcmp(errors,"ignore") == 0) {
2090 return 0;
2091 }
2092 else if (strcmp(errors,"replace") == 0) {
2093 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002094 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002095 return 0;
2096 }
2097 else {
2098 PyErr_Format(PyExc_ValueError,
2099 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002100 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101 errors);
2102 return -1;
2103 }
2104}
2105
2106PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2107 int size,
2108 const char *errors)
2109{
2110 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002111 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002112
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 repr = PyString_FromStringAndSize(NULL, size);
2114 if (repr == NULL)
2115 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002116 if (size == 0)
2117 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118
2119 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002120 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 while (size-- > 0) {
2122 Py_UNICODE ch = *p++;
2123 if (ch >= 256) {
2124 if (latin1_encoding_error(&p, &s, errors,
2125 "ordinal not in range(256)"))
2126 goto onError;
2127 }
2128 else
2129 *s++ = (char)ch;
2130 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002131 /* Resize if error handling skipped some characters */
2132 if (s - start < PyString_GET_SIZE(repr))
2133 if (_PyString_Resize(&repr, s - start))
2134 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 return repr;
2136
2137 onError:
2138 Py_DECREF(repr);
2139 return NULL;
2140}
2141
2142PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2143{
2144 if (!PyUnicode_Check(unicode)) {
2145 PyErr_BadArgument();
2146 return NULL;
2147 }
2148 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2149 PyUnicode_GET_SIZE(unicode),
2150 NULL);
2151}
2152
2153/* --- 7-bit ASCII Codec -------------------------------------------------- */
2154
2155static
2156int ascii_decoding_error(const char **source,
2157 Py_UNICODE **dest,
2158 const char *errors,
2159 const char *details)
2160{
2161 if ((errors == NULL) ||
2162 (strcmp(errors,"strict") == 0)) {
2163 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002164 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 details);
2166 return -1;
2167 }
2168 else if (strcmp(errors,"ignore") == 0) {
2169 return 0;
2170 }
2171 else if (strcmp(errors,"replace") == 0) {
2172 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2173 (*dest)++;
2174 return 0;
2175 }
2176 else {
2177 PyErr_Format(PyExc_ValueError,
2178 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002179 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 errors);
2181 return -1;
2182 }
2183}
2184
2185PyObject *PyUnicode_DecodeASCII(const char *s,
2186 int size,
2187 const char *errors)
2188{
2189 PyUnicodeObject *v;
2190 Py_UNICODE *p;
2191
2192 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002193 if (size == 1 && *(unsigned char*)s < 128) {
2194 Py_UNICODE r = *(unsigned char*)s;
2195 return PyUnicode_FromUnicode(&r, 1);
2196 }
2197
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 v = _PyUnicode_New(size);
2199 if (v == NULL)
2200 goto onError;
2201 if (size == 0)
2202 return (PyObject *)v;
2203 p = PyUnicode_AS_UNICODE(v);
2204 while (size-- > 0) {
2205 register unsigned char c;
2206
2207 c = (unsigned char)*s++;
2208 if (c < 128)
2209 *p++ = c;
2210 else if (ascii_decoding_error(&s, &p, errors,
2211 "ordinal not in range(128)"))
2212 goto onError;
2213 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002214 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002215 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002216 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 return (PyObject *)v;
2218
2219 onError:
2220 Py_XDECREF(v);
2221 return NULL;
2222}
2223
2224static
2225int ascii_encoding_error(const Py_UNICODE **source,
2226 char **dest,
2227 const char *errors,
2228 const char *details)
2229{
2230 if ((errors == NULL) ||
2231 (strcmp(errors,"strict") == 0)) {
2232 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002233 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 details);
2235 return -1;
2236 }
2237 else if (strcmp(errors,"ignore") == 0) {
2238 return 0;
2239 }
2240 else if (strcmp(errors,"replace") == 0) {
2241 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002242 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243 return 0;
2244 }
2245 else {
2246 PyErr_Format(PyExc_ValueError,
2247 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002248 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249 errors);
2250 return -1;
2251 }
2252}
2253
2254PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2255 int size,
2256 const char *errors)
2257{
2258 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002259 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002260
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 repr = PyString_FromStringAndSize(NULL, size);
2262 if (repr == NULL)
2263 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002264 if (size == 0)
2265 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266
2267 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002268 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 while (size-- > 0) {
2270 Py_UNICODE ch = *p++;
2271 if (ch >= 128) {
2272 if (ascii_encoding_error(&p, &s, errors,
2273 "ordinal not in range(128)"))
2274 goto onError;
2275 }
2276 else
2277 *s++ = (char)ch;
2278 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002279 /* Resize if error handling skipped some characters */
2280 if (s - start < PyString_GET_SIZE(repr))
2281 if (_PyString_Resize(&repr, s - start))
2282 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283 return repr;
2284
2285 onError:
2286 Py_DECREF(repr);
2287 return NULL;
2288}
2289
2290PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2291{
2292 if (!PyUnicode_Check(unicode)) {
2293 PyErr_BadArgument();
2294 return NULL;
2295 }
2296 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2297 PyUnicode_GET_SIZE(unicode),
2298 NULL);
2299}
2300
Fredrik Lundh30831632001-06-26 15:11:00 +00002301#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002302
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002303/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002304
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002305PyObject *PyUnicode_DecodeMBCS(const char *s,
2306 int size,
2307 const char *errors)
2308{
2309 PyUnicodeObject *v;
2310 Py_UNICODE *p;
2311
2312 /* First get the size of the result */
2313 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002314 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002315 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2316
2317 v = _PyUnicode_New(usize);
2318 if (v == NULL)
2319 return NULL;
2320 if (usize == 0)
2321 return (PyObject *)v;
2322 p = PyUnicode_AS_UNICODE(v);
2323 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2324 Py_DECREF(v);
2325 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2326 }
2327
2328 return (PyObject *)v;
2329}
2330
2331PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2332 int size,
2333 const char *errors)
2334{
2335 PyObject *repr;
2336 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002337 DWORD mbcssize;
2338
2339 /* If there are no characters, bail now! */
2340 if (size==0)
2341 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002342
2343 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002344 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002345 if (mbcssize==0)
2346 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2347
2348 repr = PyString_FromStringAndSize(NULL, mbcssize);
2349 if (repr == NULL)
2350 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002351 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002352 return repr;
2353
2354 /* Do the conversion */
2355 s = PyString_AS_STRING(repr);
2356 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2357 Py_DECREF(repr);
2358 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2359 }
2360 return repr;
2361}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002362
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002363#endif /* MS_WIN32 */
2364
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365/* --- Character Mapping Codec -------------------------------------------- */
2366
2367static
2368int charmap_decoding_error(const char **source,
2369 Py_UNICODE **dest,
2370 const char *errors,
2371 const char *details)
2372{
2373 if ((errors == NULL) ||
2374 (strcmp(errors,"strict") == 0)) {
2375 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002376 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377 details);
2378 return -1;
2379 }
2380 else if (strcmp(errors,"ignore") == 0) {
2381 return 0;
2382 }
2383 else if (strcmp(errors,"replace") == 0) {
2384 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2385 (*dest)++;
2386 return 0;
2387 }
2388 else {
2389 PyErr_Format(PyExc_ValueError,
2390 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002391 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002392 errors);
2393 return -1;
2394 }
2395}
2396
2397PyObject *PyUnicode_DecodeCharmap(const char *s,
2398 int size,
2399 PyObject *mapping,
2400 const char *errors)
2401{
2402 PyUnicodeObject *v;
2403 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002404 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002405
2406 /* Default to Latin-1 */
2407 if (mapping == NULL)
2408 return PyUnicode_DecodeLatin1(s, size, errors);
2409
2410 v = _PyUnicode_New(size);
2411 if (v == NULL)
2412 goto onError;
2413 if (size == 0)
2414 return (PyObject *)v;
2415 p = PyUnicode_AS_UNICODE(v);
2416 while (size-- > 0) {
2417 unsigned char ch = *s++;
2418 PyObject *w, *x;
2419
2420 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2421 w = PyInt_FromLong((long)ch);
2422 if (w == NULL)
2423 goto onError;
2424 x = PyObject_GetItem(mapping, w);
2425 Py_DECREF(w);
2426 if (x == NULL) {
2427 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002428 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002430 x = Py_None;
2431 Py_INCREF(x);
2432 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002433 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002434 }
2435
2436 /* Apply mapping */
2437 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002438 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002439 if (value < 0 || value > 65535) {
2440 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002441 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442 Py_DECREF(x);
2443 goto onError;
2444 }
2445 *p++ = (Py_UNICODE)value;
2446 }
2447 else if (x == Py_None) {
2448 /* undefined mapping */
2449 if (charmap_decoding_error(&s, &p, errors,
2450 "character maps to <undefined>")) {
2451 Py_DECREF(x);
2452 goto onError;
2453 }
2454 }
2455 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002456 int targetsize = PyUnicode_GET_SIZE(x);
2457
2458 if (targetsize == 1)
2459 /* 1-1 mapping */
2460 *p++ = *PyUnicode_AS_UNICODE(x);
2461
2462 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002464 if (targetsize > extrachars) {
2465 /* resize first */
2466 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2467 int needed = (targetsize - extrachars) + \
2468 (targetsize << 2);
2469 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002470 if (_PyUnicode_Resize(&v,
2471 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002472 Py_DECREF(x);
2473 goto onError;
2474 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002475 p = PyUnicode_AS_UNICODE(v) + oldpos;
2476 }
2477 Py_UNICODE_COPY(p,
2478 PyUnicode_AS_UNICODE(x),
2479 targetsize);
2480 p += targetsize;
2481 extrachars -= targetsize;
2482 }
2483 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 }
2485 else {
2486 /* wrong return value */
2487 PyErr_SetString(PyExc_TypeError,
2488 "character mapping must return integer, None or unicode");
2489 Py_DECREF(x);
2490 goto onError;
2491 }
2492 Py_DECREF(x);
2493 }
2494 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002495 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 goto onError;
2497 return (PyObject *)v;
2498
2499 onError:
2500 Py_XDECREF(v);
2501 return NULL;
2502}
2503
2504static
2505int charmap_encoding_error(const Py_UNICODE **source,
2506 char **dest,
2507 const char *errors,
2508 const char *details)
2509{
2510 if ((errors == NULL) ||
2511 (strcmp(errors,"strict") == 0)) {
2512 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002513 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514 details);
2515 return -1;
2516 }
2517 else if (strcmp(errors,"ignore") == 0) {
2518 return 0;
2519 }
2520 else if (strcmp(errors,"replace") == 0) {
2521 **dest = '?';
2522 (*dest)++;
2523 return 0;
2524 }
2525 else {
2526 PyErr_Format(PyExc_ValueError,
2527 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002528 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 errors);
2530 return -1;
2531 }
2532}
2533
2534PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2535 int size,
2536 PyObject *mapping,
2537 const char *errors)
2538{
2539 PyObject *v;
2540 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002541 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542
2543 /* Default to Latin-1 */
2544 if (mapping == NULL)
2545 return PyUnicode_EncodeLatin1(p, size, errors);
2546
2547 v = PyString_FromStringAndSize(NULL, size);
2548 if (v == NULL)
2549 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002550 if (size == 0)
2551 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552 s = PyString_AS_STRING(v);
2553 while (size-- > 0) {
2554 Py_UNICODE ch = *p++;
2555 PyObject *w, *x;
2556
2557 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2558 w = PyInt_FromLong((long)ch);
2559 if (w == NULL)
2560 goto onError;
2561 x = PyObject_GetItem(mapping, w);
2562 Py_DECREF(w);
2563 if (x == NULL) {
2564 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002565 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002567 x = Py_None;
2568 Py_INCREF(x);
2569 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002570 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571 }
2572
2573 /* Apply mapping */
2574 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002575 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 if (value < 0 || value > 255) {
2577 PyErr_SetString(PyExc_TypeError,
2578 "character mapping must be in range(256)");
2579 Py_DECREF(x);
2580 goto onError;
2581 }
2582 *s++ = (char)value;
2583 }
2584 else if (x == Py_None) {
2585 /* undefined mapping */
2586 if (charmap_encoding_error(&p, &s, errors,
2587 "character maps to <undefined>")) {
2588 Py_DECREF(x);
2589 goto onError;
2590 }
2591 }
2592 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002593 int targetsize = PyString_GET_SIZE(x);
2594
2595 if (targetsize == 1)
2596 /* 1-1 mapping */
2597 *s++ = *PyString_AS_STRING(x);
2598
2599 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002601 if (targetsize > extrachars) {
2602 /* resize first */
2603 int oldpos = (int)(s - PyString_AS_STRING(v));
2604 int needed = (targetsize - extrachars) + \
2605 (targetsize << 2);
2606 extrachars += needed;
2607 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002608 Py_DECREF(x);
2609 goto onError;
2610 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002611 s = PyString_AS_STRING(v) + oldpos;
2612 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002613 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002614 s += targetsize;
2615 extrachars -= targetsize;
2616 }
2617 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 }
2619 else {
2620 /* wrong return value */
2621 PyErr_SetString(PyExc_TypeError,
2622 "character mapping must return integer, None or unicode");
2623 Py_DECREF(x);
2624 goto onError;
2625 }
2626 Py_DECREF(x);
2627 }
2628 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2629 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2630 goto onError;
2631 return v;
2632
2633 onError:
2634 Py_DECREF(v);
2635 return NULL;
2636}
2637
2638PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2639 PyObject *mapping)
2640{
2641 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2642 PyErr_BadArgument();
2643 return NULL;
2644 }
2645 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2646 PyUnicode_GET_SIZE(unicode),
2647 mapping,
2648 NULL);
2649}
2650
2651static
2652int translate_error(const Py_UNICODE **source,
2653 Py_UNICODE **dest,
2654 const char *errors,
2655 const char *details)
2656{
2657 if ((errors == NULL) ||
2658 (strcmp(errors,"strict") == 0)) {
2659 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002660 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661 details);
2662 return -1;
2663 }
2664 else if (strcmp(errors,"ignore") == 0) {
2665 return 0;
2666 }
2667 else if (strcmp(errors,"replace") == 0) {
2668 **dest = '?';
2669 (*dest)++;
2670 return 0;
2671 }
2672 else {
2673 PyErr_Format(PyExc_ValueError,
2674 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002675 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 errors);
2677 return -1;
2678 }
2679}
2680
2681PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2682 int size,
2683 PyObject *mapping,
2684 const char *errors)
2685{
2686 PyUnicodeObject *v;
2687 Py_UNICODE *p;
2688
2689 if (mapping == NULL) {
2690 PyErr_BadArgument();
2691 return NULL;
2692 }
2693
2694 /* Output will never be longer than input */
2695 v = _PyUnicode_New(size);
2696 if (v == NULL)
2697 goto onError;
2698 if (size == 0)
2699 goto done;
2700 p = PyUnicode_AS_UNICODE(v);
2701 while (size-- > 0) {
2702 Py_UNICODE ch = *s++;
2703 PyObject *w, *x;
2704
2705 /* Get mapping */
2706 w = PyInt_FromLong(ch);
2707 if (w == NULL)
2708 goto onError;
2709 x = PyObject_GetItem(mapping, w);
2710 Py_DECREF(w);
2711 if (x == NULL) {
2712 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2713 /* No mapping found: default to 1-1 mapping */
2714 PyErr_Clear();
2715 *p++ = ch;
2716 continue;
2717 }
2718 goto onError;
2719 }
2720
2721 /* Apply mapping */
2722 if (PyInt_Check(x))
2723 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2724 else if (x == Py_None) {
2725 /* undefined mapping */
2726 if (translate_error(&s, &p, errors,
2727 "character maps to <undefined>")) {
2728 Py_DECREF(x);
2729 goto onError;
2730 }
2731 }
2732 else if (PyUnicode_Check(x)) {
2733 if (PyUnicode_GET_SIZE(x) != 1) {
2734 /* 1-n mapping */
2735 PyErr_SetString(PyExc_NotImplementedError,
2736 "1-n mappings are currently not implemented");
2737 Py_DECREF(x);
2738 goto onError;
2739 }
2740 *p++ = *PyUnicode_AS_UNICODE(x);
2741 }
2742 else {
2743 /* wrong return value */
2744 PyErr_SetString(PyExc_TypeError,
2745 "translate mapping must return integer, None or unicode");
2746 Py_DECREF(x);
2747 goto onError;
2748 }
2749 Py_DECREF(x);
2750 }
2751 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002752 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002753 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754
2755 done:
2756 return (PyObject *)v;
2757
2758 onError:
2759 Py_XDECREF(v);
2760 return NULL;
2761}
2762
2763PyObject *PyUnicode_Translate(PyObject *str,
2764 PyObject *mapping,
2765 const char *errors)
2766{
2767 PyObject *result;
2768
2769 str = PyUnicode_FromObject(str);
2770 if (str == NULL)
2771 goto onError;
2772 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2773 PyUnicode_GET_SIZE(str),
2774 mapping,
2775 errors);
2776 Py_DECREF(str);
2777 return result;
2778
2779 onError:
2780 Py_XDECREF(str);
2781 return NULL;
2782}
2783
Guido van Rossum9e896b32000-04-05 20:11:21 +00002784/* --- Decimal Encoder ---------------------------------------------------- */
2785
2786int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2787 int length,
2788 char *output,
2789 const char *errors)
2790{
2791 Py_UNICODE *p, *end;
2792
2793 if (output == NULL) {
2794 PyErr_BadArgument();
2795 return -1;
2796 }
2797
2798 p = s;
2799 end = s + length;
2800 while (p < end) {
2801 register Py_UNICODE ch = *p++;
2802 int decimal;
2803
2804 if (Py_UNICODE_ISSPACE(ch)) {
2805 *output++ = ' ';
2806 continue;
2807 }
2808 decimal = Py_UNICODE_TODECIMAL(ch);
2809 if (decimal >= 0) {
2810 *output++ = '0' + decimal;
2811 continue;
2812 }
Guido van Rossumba477042000-04-06 18:18:10 +00002813 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002814 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002815 continue;
2816 }
2817 /* All other characters are considered invalid */
2818 if (errors == NULL || strcmp(errors, "strict") == 0) {
2819 PyErr_SetString(PyExc_ValueError,
2820 "invalid decimal Unicode string");
2821 goto onError;
2822 }
2823 else if (strcmp(errors, "ignore") == 0)
2824 continue;
2825 else if (strcmp(errors, "replace") == 0) {
2826 *output++ = '?';
2827 continue;
2828 }
2829 }
2830 /* 0-terminate the output string */
2831 *output++ = '\0';
2832 return 0;
2833
2834 onError:
2835 return -1;
2836}
2837
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838/* --- Helpers ------------------------------------------------------------ */
2839
2840static
2841int count(PyUnicodeObject *self,
2842 int start,
2843 int end,
2844 PyUnicodeObject *substring)
2845{
2846 int count = 0;
2847
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002848 if (start < 0)
2849 start += self->length;
2850 if (start < 0)
2851 start = 0;
2852 if (end > self->length)
2853 end = self->length;
2854 if (end < 0)
2855 end += self->length;
2856 if (end < 0)
2857 end = 0;
2858
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002859 if (substring->length == 0)
2860 return (end - start + 1);
2861
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862 end -= substring->length;
2863
2864 while (start <= end)
2865 if (Py_UNICODE_MATCH(self, start, substring)) {
2866 count++;
2867 start += substring->length;
2868 } else
2869 start++;
2870
2871 return count;
2872}
2873
2874int PyUnicode_Count(PyObject *str,
2875 PyObject *substr,
2876 int start,
2877 int end)
2878{
2879 int result;
2880
2881 str = PyUnicode_FromObject(str);
2882 if (str == NULL)
2883 return -1;
2884 substr = PyUnicode_FromObject(substr);
2885 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002886 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 return -1;
2888 }
2889
2890 result = count((PyUnicodeObject *)str,
2891 start, end,
2892 (PyUnicodeObject *)substr);
2893
2894 Py_DECREF(str);
2895 Py_DECREF(substr);
2896 return result;
2897}
2898
2899static
2900int findstring(PyUnicodeObject *self,
2901 PyUnicodeObject *substring,
2902 int start,
2903 int end,
2904 int direction)
2905{
2906 if (start < 0)
2907 start += self->length;
2908 if (start < 0)
2909 start = 0;
2910
2911 if (substring->length == 0)
2912 return start;
2913
2914 if (end > self->length)
2915 end = self->length;
2916 if (end < 0)
2917 end += self->length;
2918 if (end < 0)
2919 end = 0;
2920
2921 end -= substring->length;
2922
2923 if (direction < 0) {
2924 for (; end >= start; end--)
2925 if (Py_UNICODE_MATCH(self, end, substring))
2926 return end;
2927 } else {
2928 for (; start <= end; start++)
2929 if (Py_UNICODE_MATCH(self, start, substring))
2930 return start;
2931 }
2932
2933 return -1;
2934}
2935
2936int PyUnicode_Find(PyObject *str,
2937 PyObject *substr,
2938 int start,
2939 int end,
2940 int direction)
2941{
2942 int result;
2943
2944 str = PyUnicode_FromObject(str);
2945 if (str == NULL)
2946 return -1;
2947 substr = PyUnicode_FromObject(substr);
2948 if (substr == NULL) {
2949 Py_DECREF(substr);
2950 return -1;
2951 }
2952
2953 result = findstring((PyUnicodeObject *)str,
2954 (PyUnicodeObject *)substr,
2955 start, end, direction);
2956 Py_DECREF(str);
2957 Py_DECREF(substr);
2958 return result;
2959}
2960
2961static
2962int tailmatch(PyUnicodeObject *self,
2963 PyUnicodeObject *substring,
2964 int start,
2965 int end,
2966 int direction)
2967{
2968 if (start < 0)
2969 start += self->length;
2970 if (start < 0)
2971 start = 0;
2972
2973 if (substring->length == 0)
2974 return 1;
2975
2976 if (end > self->length)
2977 end = self->length;
2978 if (end < 0)
2979 end += self->length;
2980 if (end < 0)
2981 end = 0;
2982
2983 end -= substring->length;
2984 if (end < start)
2985 return 0;
2986
2987 if (direction > 0) {
2988 if (Py_UNICODE_MATCH(self, end, substring))
2989 return 1;
2990 } else {
2991 if (Py_UNICODE_MATCH(self, start, substring))
2992 return 1;
2993 }
2994
2995 return 0;
2996}
2997
2998int PyUnicode_Tailmatch(PyObject *str,
2999 PyObject *substr,
3000 int start,
3001 int end,
3002 int direction)
3003{
3004 int result;
3005
3006 str = PyUnicode_FromObject(str);
3007 if (str == NULL)
3008 return -1;
3009 substr = PyUnicode_FromObject(substr);
3010 if (substr == NULL) {
3011 Py_DECREF(substr);
3012 return -1;
3013 }
3014
3015 result = tailmatch((PyUnicodeObject *)str,
3016 (PyUnicodeObject *)substr,
3017 start, end, direction);
3018 Py_DECREF(str);
3019 Py_DECREF(substr);
3020 return result;
3021}
3022
3023static
3024const Py_UNICODE *findchar(const Py_UNICODE *s,
3025 int size,
3026 Py_UNICODE ch)
3027{
3028 /* like wcschr, but doesn't stop at NULL characters */
3029
3030 while (size-- > 0) {
3031 if (*s == ch)
3032 return s;
3033 s++;
3034 }
3035
3036 return NULL;
3037}
3038
3039/* Apply fixfct filter to the Unicode object self and return a
3040 reference to the modified object */
3041
3042static
3043PyObject *fixup(PyUnicodeObject *self,
3044 int (*fixfct)(PyUnicodeObject *s))
3045{
3046
3047 PyUnicodeObject *u;
3048
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003049 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 if (u == NULL)
3051 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003052
3053 Py_UNICODE_COPY(u->str, self->str, self->length);
3054
Tim Peters7a29bd52001-09-12 03:03:31 +00003055 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 /* fixfct should return TRUE if it modified the buffer. If
3057 FALSE, return a reference to the original buffer instead
3058 (to save space, not time) */
3059 Py_INCREF(self);
3060 Py_DECREF(u);
3061 return (PyObject*) self;
3062 }
3063 return (PyObject*) u;
3064}
3065
3066static
3067int fixupper(PyUnicodeObject *self)
3068{
3069 int len = self->length;
3070 Py_UNICODE *s = self->str;
3071 int status = 0;
3072
3073 while (len-- > 0) {
3074 register Py_UNICODE ch;
3075
3076 ch = Py_UNICODE_TOUPPER(*s);
3077 if (ch != *s) {
3078 status = 1;
3079 *s = ch;
3080 }
3081 s++;
3082 }
3083
3084 return status;
3085}
3086
3087static
3088int fixlower(PyUnicodeObject *self)
3089{
3090 int len = self->length;
3091 Py_UNICODE *s = self->str;
3092 int status = 0;
3093
3094 while (len-- > 0) {
3095 register Py_UNICODE ch;
3096
3097 ch = Py_UNICODE_TOLOWER(*s);
3098 if (ch != *s) {
3099 status = 1;
3100 *s = ch;
3101 }
3102 s++;
3103 }
3104
3105 return status;
3106}
3107
3108static
3109int fixswapcase(PyUnicodeObject *self)
3110{
3111 int len = self->length;
3112 Py_UNICODE *s = self->str;
3113 int status = 0;
3114
3115 while (len-- > 0) {
3116 if (Py_UNICODE_ISUPPER(*s)) {
3117 *s = Py_UNICODE_TOLOWER(*s);
3118 status = 1;
3119 } else if (Py_UNICODE_ISLOWER(*s)) {
3120 *s = Py_UNICODE_TOUPPER(*s);
3121 status = 1;
3122 }
3123 s++;
3124 }
3125
3126 return status;
3127}
3128
3129static
3130int fixcapitalize(PyUnicodeObject *self)
3131{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003132 int len = self->length;
3133 Py_UNICODE *s = self->str;
3134 int status = 0;
3135
3136 if (len == 0)
3137 return 0;
3138 if (Py_UNICODE_ISLOWER(*s)) {
3139 *s = Py_UNICODE_TOUPPER(*s);
3140 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003142 s++;
3143 while (--len > 0) {
3144 if (Py_UNICODE_ISUPPER(*s)) {
3145 *s = Py_UNICODE_TOLOWER(*s);
3146 status = 1;
3147 }
3148 s++;
3149 }
3150 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151}
3152
3153static
3154int fixtitle(PyUnicodeObject *self)
3155{
3156 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3157 register Py_UNICODE *e;
3158 int previous_is_cased;
3159
3160 /* Shortcut for single character strings */
3161 if (PyUnicode_GET_SIZE(self) == 1) {
3162 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3163 if (*p != ch) {
3164 *p = ch;
3165 return 1;
3166 }
3167 else
3168 return 0;
3169 }
3170
3171 e = p + PyUnicode_GET_SIZE(self);
3172 previous_is_cased = 0;
3173 for (; p < e; p++) {
3174 register const Py_UNICODE ch = *p;
3175
3176 if (previous_is_cased)
3177 *p = Py_UNICODE_TOLOWER(ch);
3178 else
3179 *p = Py_UNICODE_TOTITLE(ch);
3180
3181 if (Py_UNICODE_ISLOWER(ch) ||
3182 Py_UNICODE_ISUPPER(ch) ||
3183 Py_UNICODE_ISTITLE(ch))
3184 previous_is_cased = 1;
3185 else
3186 previous_is_cased = 0;
3187 }
3188 return 1;
3189}
3190
3191PyObject *PyUnicode_Join(PyObject *separator,
3192 PyObject *seq)
3193{
3194 Py_UNICODE *sep;
3195 int seplen;
3196 PyUnicodeObject *res = NULL;
3197 int reslen = 0;
3198 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 int sz = 100;
3200 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003201 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202
Tim Peters2cfe3682001-05-05 05:36:48 +00003203 it = PyObject_GetIter(seq);
3204 if (it == NULL)
3205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206
3207 if (separator == NULL) {
3208 Py_UNICODE blank = ' ';
3209 sep = &blank;
3210 seplen = 1;
3211 }
3212 else {
3213 separator = PyUnicode_FromObject(separator);
3214 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003215 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 sep = PyUnicode_AS_UNICODE(separator);
3217 seplen = PyUnicode_GET_SIZE(separator);
3218 }
3219
3220 res = _PyUnicode_New(sz);
3221 if (res == NULL)
3222 goto onError;
3223 p = PyUnicode_AS_UNICODE(res);
3224 reslen = 0;
3225
Tim Peters2cfe3682001-05-05 05:36:48 +00003226 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003228 PyObject *item = PyIter_Next(it);
3229 if (item == NULL) {
3230 if (PyErr_Occurred())
3231 goto onError;
3232 break;
3233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 if (!PyUnicode_Check(item)) {
3235 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003236 if (!PyString_Check(item)) {
3237 PyErr_Format(PyExc_TypeError,
3238 "sequence item %i: expected string or Unicode,"
3239 " %.80s found",
3240 i, item->ob_type->tp_name);
3241 Py_DECREF(item);
3242 goto onError;
3243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 v = PyUnicode_FromObject(item);
3245 Py_DECREF(item);
3246 item = v;
3247 if (item == NULL)
3248 goto onError;
3249 }
3250 itemlen = PyUnicode_GET_SIZE(item);
3251 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003252 if (_PyUnicode_Resize(&res, sz*2)) {
3253 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 sz *= 2;
3257 p = PyUnicode_AS_UNICODE(res) + reslen;
3258 }
3259 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003260 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 p += seplen;
3262 reslen += seplen;
3263 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003264 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 p += itemlen;
3266 reslen += itemlen;
3267 Py_DECREF(item);
3268 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003269 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 goto onError;
3271
3272 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003273 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003274 return (PyObject *)res;
3275
3276 onError:
3277 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003278 Py_XDECREF(res);
3279 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280 return NULL;
3281}
3282
3283static
3284PyUnicodeObject *pad(PyUnicodeObject *self,
3285 int left,
3286 int right,
3287 Py_UNICODE fill)
3288{
3289 PyUnicodeObject *u;
3290
3291 if (left < 0)
3292 left = 0;
3293 if (right < 0)
3294 right = 0;
3295
Tim Peters7a29bd52001-09-12 03:03:31 +00003296 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297 Py_INCREF(self);
3298 return self;
3299 }
3300
3301 u = _PyUnicode_New(left + self->length + right);
3302 if (u) {
3303 if (left)
3304 Py_UNICODE_FILL(u->str, fill, left);
3305 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3306 if (right)
3307 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3308 }
3309
3310 return u;
3311}
3312
/* Append data[left:right] as a new Unicode object to `list`.

   Relies on the expansion site providing a `PyObject *str` scratch
   variable, a `PyObject *list`, and an `onError` label, which is jumped
   to if the object cannot be created or appended.  `left` is evaluated
   twice.  The do/while wrapper makes the expansion a single statement,
   so the macro is safe in unbraced if/else bodies; arguments are
   parenthesized so that compound expressions expand as intended. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3323
3324static
3325PyObject *split_whitespace(PyUnicodeObject *self,
3326 PyObject *list,
3327 int maxcount)
3328{
3329 register int i;
3330 register int j;
3331 int len = self->length;
3332 PyObject *str;
3333
3334 for (i = j = 0; i < len; ) {
3335 /* find a token */
3336 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3337 i++;
3338 j = i;
3339 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3340 i++;
3341 if (j < i) {
3342 if (maxcount-- <= 0)
3343 break;
3344 SPLIT_APPEND(self->str, j, i);
3345 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3346 i++;
3347 j = i;
3348 }
3349 }
3350 if (j < len) {
3351 SPLIT_APPEND(self->str, j, len);
3352 }
3353 return list;
3354
3355 onError:
3356 Py_DECREF(list);
3357 return NULL;
3358}
3359
3360PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003361 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362{
3363 register int i;
3364 register int j;
3365 int len;
3366 PyObject *list;
3367 PyObject *str;
3368 Py_UNICODE *data;
3369
3370 string = PyUnicode_FromObject(string);
3371 if (string == NULL)
3372 return NULL;
3373 data = PyUnicode_AS_UNICODE(string);
3374 len = PyUnicode_GET_SIZE(string);
3375
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376 list = PyList_New(0);
3377 if (!list)
3378 goto onError;
3379
3380 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003381 int eol;
3382
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383 /* Find a line and append it */
3384 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3385 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386
3387 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003388 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389 if (i < len) {
3390 if (data[i] == '\r' && i + 1 < len &&
3391 data[i+1] == '\n')
3392 i += 2;
3393 else
3394 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003395 if (keepends)
3396 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 }
Guido van Rossum86662912000-04-11 15:38:46 +00003398 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003399 j = i;
3400 }
3401 if (j < len) {
3402 SPLIT_APPEND(data, j, len);
3403 }
3404
3405 Py_DECREF(string);
3406 return list;
3407
3408 onError:
3409 Py_DECREF(list);
3410 Py_DECREF(string);
3411 return NULL;
3412}
3413
3414static
3415PyObject *split_char(PyUnicodeObject *self,
3416 PyObject *list,
3417 Py_UNICODE ch,
3418 int maxcount)
3419{
3420 register int i;
3421 register int j;
3422 int len = self->length;
3423 PyObject *str;
3424
3425 for (i = j = 0; i < len; ) {
3426 if (self->str[i] == ch) {
3427 if (maxcount-- <= 0)
3428 break;
3429 SPLIT_APPEND(self->str, j, i);
3430 i = j = i + 1;
3431 } else
3432 i++;
3433 }
3434 if (j <= len) {
3435 SPLIT_APPEND(self->str, j, len);
3436 }
3437 return list;
3438
3439 onError:
3440 Py_DECREF(list);
3441 return NULL;
3442}
3443
3444static
3445PyObject *split_substring(PyUnicodeObject *self,
3446 PyObject *list,
3447 PyUnicodeObject *substring,
3448 int maxcount)
3449{
3450 register int i;
3451 register int j;
3452 int len = self->length;
3453 int sublen = substring->length;
3454 PyObject *str;
3455
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003456 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 if (Py_UNICODE_MATCH(self, i, substring)) {
3458 if (maxcount-- <= 0)
3459 break;
3460 SPLIT_APPEND(self->str, j, i);
3461 i = j = i + sublen;
3462 } else
3463 i++;
3464 }
3465 if (j <= len) {
3466 SPLIT_APPEND(self->str, j, len);
3467 }
3468 return list;
3469
3470 onError:
3471 Py_DECREF(list);
3472 return NULL;
3473}
3474
3475#undef SPLIT_APPEND
3476
3477static
3478PyObject *split(PyUnicodeObject *self,
3479 PyUnicodeObject *substring,
3480 int maxcount)
3481{
3482 PyObject *list;
3483
3484 if (maxcount < 0)
3485 maxcount = INT_MAX;
3486
3487 list = PyList_New(0);
3488 if (!list)
3489 return NULL;
3490
3491 if (substring == NULL)
3492 return split_whitespace(self,list,maxcount);
3493
3494 else if (substring->length == 1)
3495 return split_char(self,list,substring->str[0],maxcount);
3496
3497 else if (substring->length == 0) {
3498 Py_DECREF(list);
3499 PyErr_SetString(PyExc_ValueError, "empty separator");
3500 return NULL;
3501 }
3502 else
3503 return split_substring(self,list,substring,maxcount);
3504}
3505
3506static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507PyObject *replace(PyUnicodeObject *self,
3508 PyUnicodeObject *str1,
3509 PyUnicodeObject *str2,
3510 int maxcount)
3511{
3512 PyUnicodeObject *u;
3513
3514 if (maxcount < 0)
3515 maxcount = INT_MAX;
3516
3517 if (str1->length == 1 && str2->length == 1) {
3518 int i;
3519
3520 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003521 if (!findchar(self->str, self->length, str1->str[0]) &&
3522 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003523 /* nothing to replace, return original string */
3524 Py_INCREF(self);
3525 u = self;
3526 } else {
3527 Py_UNICODE u1 = str1->str[0];
3528 Py_UNICODE u2 = str2->str[0];
3529
3530 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003531 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532 self->length
3533 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003534 if (u != NULL) {
3535 Py_UNICODE_COPY(u->str, self->str,
3536 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 for (i = 0; i < u->length; i++)
3538 if (u->str[i] == u1) {
3539 if (--maxcount < 0)
3540 break;
3541 u->str[i] = u2;
3542 }
3543 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545
3546 } else {
3547 int n, i;
3548 Py_UNICODE *p;
3549
3550 /* replace strings */
3551 n = count(self, 0, self->length, str1);
3552 if (n > maxcount)
3553 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003554 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 /* nothing to replace, return original string */
3556 Py_INCREF(self);
3557 u = self;
3558 } else {
3559 u = _PyUnicode_New(
3560 self->length + n * (str2->length - str1->length));
3561 if (u) {
3562 i = 0;
3563 p = u->str;
3564 while (i <= self->length - str1->length)
3565 if (Py_UNICODE_MATCH(self, i, str1)) {
3566 /* replace string segment */
3567 Py_UNICODE_COPY(p, str2->str, str2->length);
3568 p += str2->length;
3569 i += str1->length;
3570 if (--n <= 0) {
3571 /* copy remaining part */
3572 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3573 break;
3574 }
3575 } else
3576 *p++ = self->str[i++];
3577 }
3578 }
3579 }
3580
3581 return (PyObject *) u;
3582}
3583
3584/* --- Unicode Object Methods --------------------------------------------- */
3585
3586static char title__doc__[] =
3587"S.title() -> unicode\n\
3588\n\
3589Return a titlecased version of S, i.e. words start with title case\n\
3590characters, all remaining cased characters have lower case.";
3591
3592static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003593unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595 return fixup(self, fixtitle);
3596}
3597
3598static char capitalize__doc__[] =
3599"S.capitalize() -> unicode\n\
3600\n\
3601Return a capitalized version of S, i.e. make the first character\n\
3602have upper case.";
3603
3604static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003605unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 return fixup(self, fixcapitalize);
3608}
3609
3610#if 0
3611static char capwords__doc__[] =
3612"S.capwords() -> unicode\n\
3613\n\
3614Apply .capitalize() to all words in S and return the result with\n\
3615normalized whitespace (all whitespace strings are replaced by ' ').";
3616
3617static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003618unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619{
3620 PyObject *list;
3621 PyObject *item;
3622 int i;
3623
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 /* Split into words */
3625 list = split(self, NULL, -1);
3626 if (!list)
3627 return NULL;
3628
3629 /* Capitalize each word */
3630 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3631 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3632 fixcapitalize);
3633 if (item == NULL)
3634 goto onError;
3635 Py_DECREF(PyList_GET_ITEM(list, i));
3636 PyList_SET_ITEM(list, i, item);
3637 }
3638
3639 /* Join the words to form a new string */
3640 item = PyUnicode_Join(NULL, list);
3641
3642onError:
3643 Py_DECREF(list);
3644 return (PyObject *)item;
3645}
3646#endif
3647
3648static char center__doc__[] =
3649"S.center(width) -> unicode\n\
3650\n\
3651Return S centered in a Unicode string of length width. Padding is done\n\
3652using spaces.";
3653
3654static PyObject *
3655unicode_center(PyUnicodeObject *self, PyObject *args)
3656{
3657 int marg, left;
3658 int width;
3659
3660 if (!PyArg_ParseTuple(args, "i:center", &width))
3661 return NULL;
3662
Tim Peters7a29bd52001-09-12 03:03:31 +00003663 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 Py_INCREF(self);
3665 return (PyObject*) self;
3666 }
3667
3668 marg = width - self->length;
3669 left = marg / 2 + (marg & width & 1);
3670
3671 return (PyObject*) pad(self, left, marg - left, ' ');
3672}
3673
Marc-André Lemburge5034372000-08-08 08:04:29 +00003674#if 0
3675
3676/* This code should go into some future Unicode collation support
3677 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003678 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003679
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003680/* speedy UTF-16 code point order comparison */
3681/* gleaned from: */
3682/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3683
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003684static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003685{
3686 0, 0, 0, 0, 0, 0, 0, 0,
3687 0, 0, 0, 0, 0, 0, 0, 0,
3688 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003689 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003690};
3691
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692static int
3693unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3694{
3695 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003696
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 Py_UNICODE *s1 = str1->str;
3698 Py_UNICODE *s2 = str2->str;
3699
3700 len1 = str1->length;
3701 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003702
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003704 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003705
3706 c1 = *s1++;
3707 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003708
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003709 if (c1 > (1<<11) * 26)
3710 c1 += utf16Fixup[c1>>11];
3711 if (c2 > (1<<11) * 26)
3712 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003713 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003714
3715 if (c1 != c2)
3716 return (c1 < c2) ? -1 : 1;
3717
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003718 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 }
3720
3721 return (len1 < len2) ? -1 : (len1 != len2);
3722}
3723
Marc-André Lemburge5034372000-08-08 08:04:29 +00003724#else
3725
3726static int
3727unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3728{
3729 register int len1, len2;
3730
3731 Py_UNICODE *s1 = str1->str;
3732 Py_UNICODE *s2 = str2->str;
3733
3734 len1 = str1->length;
3735 len2 = str2->length;
3736
3737 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003738 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003739
Fredrik Lundh45714e92001-06-26 16:39:36 +00003740 c1 = *s1++;
3741 c2 = *s2++;
3742
3743 if (c1 != c2)
3744 return (c1 < c2) ? -1 : 1;
3745
Marc-André Lemburge5034372000-08-08 08:04:29 +00003746 len1--; len2--;
3747 }
3748
3749 return (len1 < len2) ? -1 : (len1 != len2);
3750}
3751
3752#endif
3753
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754int PyUnicode_Compare(PyObject *left,
3755 PyObject *right)
3756{
3757 PyUnicodeObject *u = NULL, *v = NULL;
3758 int result;
3759
3760 /* Coerce the two arguments */
3761 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3762 if (u == NULL)
3763 goto onError;
3764 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3765 if (v == NULL)
3766 goto onError;
3767
Thomas Wouters7e474022000-07-16 12:04:32 +00003768 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 if (v == u) {
3770 Py_DECREF(u);
3771 Py_DECREF(v);
3772 return 0;
3773 }
3774
3775 result = unicode_compare(u, v);
3776
3777 Py_DECREF(u);
3778 Py_DECREF(v);
3779 return result;
3780
3781onError:
3782 Py_XDECREF(u);
3783 Py_XDECREF(v);
3784 return -1;
3785}
3786
Guido van Rossum403d68b2000-03-13 15:55:09 +00003787int PyUnicode_Contains(PyObject *container,
3788 PyObject *element)
3789{
3790 PyUnicodeObject *u = NULL, *v = NULL;
3791 int result;
3792 register const Py_UNICODE *p, *e;
3793 register Py_UNICODE ch;
3794
3795 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003796 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003797 if (v == NULL) {
3798 PyErr_SetString(PyExc_TypeError,
3799 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003800 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003801 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003802 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3803 if (u == NULL) {
3804 Py_DECREF(v);
3805 goto onError;
3806 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003807
3808 /* Check v in u */
3809 if (PyUnicode_GET_SIZE(v) != 1) {
3810 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003811 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003812 goto onError;
3813 }
3814 ch = *PyUnicode_AS_UNICODE(v);
3815 p = PyUnicode_AS_UNICODE(u);
3816 e = p + PyUnicode_GET_SIZE(u);
3817 result = 0;
3818 while (p < e) {
3819 if (*p++ == ch) {
3820 result = 1;
3821 break;
3822 }
3823 }
3824
3825 Py_DECREF(u);
3826 Py_DECREF(v);
3827 return result;
3828
3829onError:
3830 Py_XDECREF(u);
3831 Py_XDECREF(v);
3832 return -1;
3833}
3834
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835/* Concat to string or Unicode object giving a new Unicode object. */
3836
3837PyObject *PyUnicode_Concat(PyObject *left,
3838 PyObject *right)
3839{
3840 PyUnicodeObject *u = NULL, *v = NULL, *w;
3841
3842 /* Coerce the two arguments */
3843 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3844 if (u == NULL)
3845 goto onError;
3846 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3847 if (v == NULL)
3848 goto onError;
3849
3850 /* Shortcuts */
3851 if (v == unicode_empty) {
3852 Py_DECREF(v);
3853 return (PyObject *)u;
3854 }
3855 if (u == unicode_empty) {
3856 Py_DECREF(u);
3857 return (PyObject *)v;
3858 }
3859
3860 /* Concat the two Unicode strings */
3861 w = _PyUnicode_New(u->length + v->length);
3862 if (w == NULL)
3863 goto onError;
3864 Py_UNICODE_COPY(w->str, u->str, u->length);
3865 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3866
3867 Py_DECREF(u);
3868 Py_DECREF(v);
3869 return (PyObject *)w;
3870
3871onError:
3872 Py_XDECREF(u);
3873 Py_XDECREF(v);
3874 return NULL;
3875}
3876
3877static char count__doc__[] =
3878"S.count(sub[, start[, end]]) -> int\n\
3879\n\
3880Return the number of occurrences of substring sub in Unicode string\n\
3881S[start:end]. Optional arguments start and end are\n\
3882interpreted as in slice notation.";
3883
3884static PyObject *
3885unicode_count(PyUnicodeObject *self, PyObject *args)
3886{
3887 PyUnicodeObject *substring;
3888 int start = 0;
3889 int end = INT_MAX;
3890 PyObject *result;
3891
Guido van Rossumb8872e62000-05-09 14:14:27 +00003892 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3893 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 return NULL;
3895
3896 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3897 (PyObject *)substring);
3898 if (substring == NULL)
3899 return NULL;
3900
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901 if (start < 0)
3902 start += self->length;
3903 if (start < 0)
3904 start = 0;
3905 if (end > self->length)
3906 end = self->length;
3907 if (end < 0)
3908 end += self->length;
3909 if (end < 0)
3910 end = 0;
3911
3912 result = PyInt_FromLong((long) count(self, start, end, substring));
3913
3914 Py_DECREF(substring);
3915 return result;
3916}
3917
3918static char encode__doc__[] =
3919"S.encode([encoding[,errors]]) -> string\n\
3920\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003921Return an encoded string version of S. Default encoding is the current\n\
3922default string encoding. errors may be given to set a different error\n\
3923handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3924a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925
3926static PyObject *
3927unicode_encode(PyUnicodeObject *self, PyObject *args)
3928{
3929 char *encoding = NULL;
3930 char *errors = NULL;
3931 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3932 return NULL;
3933 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3934}
3935
3936static char expandtabs__doc__[] =
3937"S.expandtabs([tabsize]) -> unicode\n\
3938\n\
3939Return a copy of S where all tab characters are expanded using spaces.\n\
3940If tabsize is not given, a tab size of 8 characters is assumed.";
3941
3942static PyObject*
3943unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3944{
3945 Py_UNICODE *e;
3946 Py_UNICODE *p;
3947 Py_UNICODE *q;
3948 int i, j;
3949 PyUnicodeObject *u;
3950 int tabsize = 8;
3951
3952 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3953 return NULL;
3954
Thomas Wouters7e474022000-07-16 12:04:32 +00003955 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 i = j = 0;
3957 e = self->str + self->length;
3958 for (p = self->str; p < e; p++)
3959 if (*p == '\t') {
3960 if (tabsize > 0)
3961 j += tabsize - (j % tabsize);
3962 }
3963 else {
3964 j++;
3965 if (*p == '\n' || *p == '\r') {
3966 i += j;
3967 j = 0;
3968 }
3969 }
3970
3971 /* Second pass: create output string and fill it */
3972 u = _PyUnicode_New(i + j);
3973 if (!u)
3974 return NULL;
3975
3976 j = 0;
3977 q = u->str;
3978
3979 for (p = self->str; p < e; p++)
3980 if (*p == '\t') {
3981 if (tabsize > 0) {
3982 i = tabsize - (j % tabsize);
3983 j += i;
3984 while (i--)
3985 *q++ = ' ';
3986 }
3987 }
3988 else {
3989 j++;
3990 *q++ = *p;
3991 if (*p == '\n' || *p == '\r')
3992 j = 0;
3993 }
3994
3995 return (PyObject*) u;
3996}
3997
3998static char find__doc__[] =
3999"S.find(sub [,start [,end]]) -> int\n\
4000\n\
4001Return the lowest index in S where substring sub is found,\n\
4002such that sub is contained within s[start,end]. Optional\n\
4003arguments start and end are interpreted as in slice notation.\n\
4004\n\
4005Return -1 on failure.";
4006
4007static PyObject *
4008unicode_find(PyUnicodeObject *self, PyObject *args)
4009{
4010 PyUnicodeObject *substring;
4011 int start = 0;
4012 int end = INT_MAX;
4013 PyObject *result;
4014
Guido van Rossumb8872e62000-05-09 14:14:27 +00004015 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4016 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017 return NULL;
4018 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4019 (PyObject *)substring);
4020 if (substring == NULL)
4021 return NULL;
4022
4023 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4024
4025 Py_DECREF(substring);
4026 return result;
4027}
4028
4029static PyObject *
4030unicode_getitem(PyUnicodeObject *self, int index)
4031{
4032 if (index < 0 || index >= self->length) {
4033 PyErr_SetString(PyExc_IndexError, "string index out of range");
4034 return NULL;
4035 }
4036
4037 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4038}
4039
4040static long
4041unicode_hash(PyUnicodeObject *self)
4042{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004043 /* Since Unicode objects compare equal to their ASCII string
4044 counterparts, they should use the individual character values
4045 as basis for their hash value. This is needed to assure that
4046 strings and Unicode objects behave in the same way as
4047 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048
Fredrik Lundhdde61642000-07-10 18:27:47 +00004049 register int len;
4050 register Py_UNICODE *p;
4051 register long x;
4052
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053 if (self->hash != -1)
4054 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004055 len = PyUnicode_GET_SIZE(self);
4056 p = PyUnicode_AS_UNICODE(self);
4057 x = *p << 7;
4058 while (--len >= 0)
4059 x = (1000003*x) ^ *p++;
4060 x ^= PyUnicode_GET_SIZE(self);
4061 if (x == -1)
4062 x = -2;
4063 self->hash = x;
4064 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065}
4066
4067static char index__doc__[] =
4068"S.index(sub [,start [,end]]) -> int\n\
4069\n\
4070Like S.find() but raise ValueError when the substring is not found.";
4071
4072static PyObject *
4073unicode_index(PyUnicodeObject *self, PyObject *args)
4074{
4075 int result;
4076 PyUnicodeObject *substring;
4077 int start = 0;
4078 int end = INT_MAX;
4079
Guido van Rossumb8872e62000-05-09 14:14:27 +00004080 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4081 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 return NULL;
4083
4084 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4085 (PyObject *)substring);
4086 if (substring == NULL)
4087 return NULL;
4088
4089 result = findstring(self, substring, start, end, 1);
4090
4091 Py_DECREF(substring);
4092 if (result < 0) {
4093 PyErr_SetString(PyExc_ValueError, "substring not found");
4094 return NULL;
4095 }
4096 return PyInt_FromLong(result);
4097}
4098
4099static char islower__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004100"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004102Return True if all cased characters in S are lowercase and there is\n\
4103at least one cased character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104
4105static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004106unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107{
4108 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4109 register const Py_UNICODE *e;
4110 int cased;
4111
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112 /* Shortcut for single character strings */
4113 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004114 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004116 /* Special case for empty strings */
4117 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004118 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004119
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 e = p + PyUnicode_GET_SIZE(self);
4121 cased = 0;
4122 for (; p < e; p++) {
4123 register const Py_UNICODE ch = *p;
4124
4125 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004126 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 else if (!cased && Py_UNICODE_ISLOWER(ch))
4128 cased = 1;
4129 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004130 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131}
4132
4133static char isupper__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004134"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004136Return True if all cased characters in S are uppercase and there is\n\
4137at least one cased character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138
4139static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004140unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141{
4142 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4143 register const Py_UNICODE *e;
4144 int cased;
4145
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 /* Shortcut for single character strings */
4147 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004148 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004150 /* Special case for empty strings */
4151 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004152 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004153
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 e = p + PyUnicode_GET_SIZE(self);
4155 cased = 0;
4156 for (; p < e; p++) {
4157 register const Py_UNICODE ch = *p;
4158
4159 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004160 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161 else if (!cased && Py_UNICODE_ISUPPER(ch))
4162 cased = 1;
4163 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004164 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165}
4166
4167static char istitle__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004168"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004170Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4171characters may only follow uncased characters and lowercase characters\n\
4172only cased ones. Return False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173
4174static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004175unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176{
4177 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4178 register const Py_UNICODE *e;
4179 int cased, previous_is_cased;
4180
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181 /* Shortcut for single character strings */
4182 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004183 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4184 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004186 /* Special case for empty strings */
4187 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004188 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004189
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190 e = p + PyUnicode_GET_SIZE(self);
4191 cased = 0;
4192 previous_is_cased = 0;
4193 for (; p < e; p++) {
4194 register const Py_UNICODE ch = *p;
4195
4196 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4197 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004198 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 previous_is_cased = 1;
4200 cased = 1;
4201 }
4202 else if (Py_UNICODE_ISLOWER(ch)) {
4203 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004204 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 previous_is_cased = 1;
4206 cased = 1;
4207 }
4208 else
4209 previous_is_cased = 0;
4210 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004211 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212}
4213
4214static char isspace__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004215"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004217Return True if there are only whitespace characters in S,\n\
4218False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004219
4220static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004221unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222{
4223 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4224 register const Py_UNICODE *e;
4225
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226 /* Shortcut for single character strings */
4227 if (PyUnicode_GET_SIZE(self) == 1 &&
4228 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004229 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004230
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004231 /* Special case for empty strings */
4232 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004233 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004234
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235 e = p + PyUnicode_GET_SIZE(self);
4236 for (; p < e; p++) {
4237 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004238 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004239 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004240 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004241}
4242
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004243static char isalpha__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004244"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004245\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004246Return True if all characters in S are alphabetic\n\
4247and there is at least one character in S, False otherwise.";
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004248
4249static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004250unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004251{
4252 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4253 register const Py_UNICODE *e;
4254
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004255 /* Shortcut for single character strings */
4256 if (PyUnicode_GET_SIZE(self) == 1 &&
4257 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004258 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004259
4260 /* Special case for empty strings */
4261 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004262 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004263
4264 e = p + PyUnicode_GET_SIZE(self);
4265 for (; p < e; p++) {
4266 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004267 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004268 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004269 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004270}
4271
4272static char isalnum__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004273"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004274\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004275Return True if all characters in S are alphanumeric\n\
4276and there is at least one character in S, False otherwise.";
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004277
4278static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004279unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004280{
4281 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4282 register const Py_UNICODE *e;
4283
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004284 /* Shortcut for single character strings */
4285 if (PyUnicode_GET_SIZE(self) == 1 &&
4286 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004287 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004288
4289 /* Special case for empty strings */
4290 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004291 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004292
4293 e = p + PyUnicode_GET_SIZE(self);
4294 for (; p < e; p++) {
4295 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004296 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004297 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004298 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004299}
4300
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301static char isdecimal__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004302"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004304Return True if there are only decimal characters in S,\n\
4305False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306
4307static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004308unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309{
4310 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4311 register const Py_UNICODE *e;
4312
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 /* Shortcut for single character strings */
4314 if (PyUnicode_GET_SIZE(self) == 1 &&
4315 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004316 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004318 /* Special case for empty strings */
4319 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004320 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004321
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322 e = p + PyUnicode_GET_SIZE(self);
4323 for (; p < e; p++) {
4324 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004325 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004327 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328}
4329
4330static char isdigit__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004331"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004333Return True if there are only digit characters in S,\n\
4334False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335
4336static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004337unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338{
4339 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4340 register const Py_UNICODE *e;
4341
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 /* Shortcut for single character strings */
4343 if (PyUnicode_GET_SIZE(self) == 1 &&
4344 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004345 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004347 /* Special case for empty strings */
4348 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004349 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004350
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351 e = p + PyUnicode_GET_SIZE(self);
4352 for (; p < e; p++) {
4353 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004354 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004356 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357}
4358
4359static char isnumeric__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004360"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004362Return True if there are only numeric characters in S,\n\
4363False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364
4365static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004366unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004367{
4368 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4369 register const Py_UNICODE *e;
4370
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371 /* Shortcut for single character strings */
4372 if (PyUnicode_GET_SIZE(self) == 1 &&
4373 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004374 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004376 /* Special case for empty strings */
4377 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004378 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004379
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380 e = p + PyUnicode_GET_SIZE(self);
4381 for (; p < e; p++) {
4382 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004383 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004385 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386}
4387
4388static char join__doc__[] =
4389"S.join(sequence) -> unicode\n\
4390\n\
4391Return a string which is the concatenation of the strings in the\n\
4392sequence. The separator between elements is S.";
4393
4394static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004395unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004397 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398}
4399
4400static int
4401unicode_length(PyUnicodeObject *self)
4402{
4403 return self->length;
4404}
4405
4406static char ljust__doc__[] =
4407"S.ljust(width) -> unicode\n\
4408\n\
4409Return S left justified in a Unicode string of length width. Padding is\n\
4410done using spaces.";
4411
4412static PyObject *
4413unicode_ljust(PyUnicodeObject *self, PyObject *args)
4414{
4415 int width;
4416 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4417 return NULL;
4418
Tim Peters7a29bd52001-09-12 03:03:31 +00004419 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 Py_INCREF(self);
4421 return (PyObject*) self;
4422 }
4423
4424 return (PyObject*) pad(self, 0, width - self->length, ' ');
4425}
4426
4427static char lower__doc__[] =
4428"S.lower() -> unicode\n\
4429\n\
4430Return a copy of the string S converted to lowercase.";
4431
4432static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004433unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435 return fixup(self, fixlower);
4436}
4437
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
4446
4447static const Py_UNICODE *
4448unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
4449{
4450 int i;
4451 for (i = 0; i<n; ++i)
4452 if (s[i]==c)
4453 return s+i;
4454 return NULL;
4455}
4456
4457/* externally visible for str.strip(unicode) */
4458PyObject *
4459_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
4460{
4461 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4462 int len = PyUnicode_GET_SIZE(self);
4463 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
4464 int seplen = PyUnicode_GET_SIZE(sepobj);
4465 int i, j;
4466
4467 i = 0;
4468 if (striptype != RIGHTSTRIP) {
4469 while (i < len && unicode_memchr(sep, s[i], seplen)) {
4470 i++;
4471 }
4472 }
4473
4474 j = len;
4475 if (striptype != LEFTSTRIP) {
4476 do {
4477 j--;
4478 } while (j >= i && unicode_memchr(sep, s[j], seplen));
4479 j++;
4480 }
4481
4482 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4483 Py_INCREF(self);
4484 return (PyObject*)self;
4485 }
4486 else
4487 return PyUnicode_FromUnicode(s+i, j-i);
4488}
4489
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490
4491static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004492do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004494 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4495 int len = PyUnicode_GET_SIZE(self), i, j;
4496
4497 i = 0;
4498 if (striptype != RIGHTSTRIP) {
4499 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
4500 i++;
4501 }
4502 }
4503
4504 j = len;
4505 if (striptype != LEFTSTRIP) {
4506 do {
4507 j--;
4508 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
4509 j++;
4510 }
4511
4512 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4513 Py_INCREF(self);
4514 return (PyObject*)self;
4515 }
4516 else
4517 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518}
4519
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004520
4521static PyObject *
4522do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
4523{
4524 PyObject *sep = NULL;
4525
4526 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
4527 return NULL;
4528
4529 if (sep != NULL && sep != Py_None) {
4530 if (PyUnicode_Check(sep))
4531 return _PyUnicode_XStrip(self, striptype, sep);
4532 else if (PyString_Check(sep)) {
4533 PyObject *res;
4534 sep = PyUnicode_FromObject(sep);
4535 if (sep==NULL)
4536 return NULL;
4537 res = _PyUnicode_XStrip(self, striptype, sep);
4538 Py_DECREF(sep);
4539 return res;
4540 }
4541 else {
4542 PyErr_Format(PyExc_TypeError,
4543 "%s arg must be None, unicode or str",
4544 STRIPNAME(striptype));
4545 return NULL;
4546 }
4547 }
4548
4549 return do_strip(self, striptype);
4550}
4551
4552
4553static char strip__doc__[] =
4554"S.strip([sep]) -> unicode\n\
4555\n\
4556Return a copy of the string S with leading and trailing\n\
4557whitespace removed.\n\
4558If sep is given and not None, remove characters in sep instead.\n\
4559If sep is a str, it will be converted to unicode before stripping";
4560
4561static PyObject *
4562unicode_strip(PyUnicodeObject *self, PyObject *args)
4563{
4564 if (PyTuple_GET_SIZE(args) == 0)
4565 return do_strip(self, BOTHSTRIP); /* Common case */
4566 else
4567 return do_argstrip(self, BOTHSTRIP, args);
4568}
4569
4570
4571static char lstrip__doc__[] =
4572"S.lstrip([sep]) -> unicode\n\
4573\n\
4574Return a copy of the string S with leading whitespace removed.\n\
4575If sep is given and not None, remove characters in sep instead.\n\
4576If sep is a str, it will be converted to unicode before stripping";
4577
4578static PyObject *
4579unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4580{
4581 if (PyTuple_GET_SIZE(args) == 0)
4582 return do_strip(self, LEFTSTRIP); /* Common case */
4583 else
4584 return do_argstrip(self, LEFTSTRIP, args);
4585}
4586
4587
4588static char rstrip__doc__[] =
4589"S.rstrip([sep]) -> unicode\n\
4590\n\
4591Return a copy of the string S with trailing whitespace removed.\n\
4592If sep is given and not None, remove characters in sep instead.\n\
4593If sep is a str, it will be converted to unicode before stripping";
4594
4595static PyObject *
4596unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4597{
4598 if (PyTuple_GET_SIZE(args) == 0)
4599 return do_strip(self, RIGHTSTRIP); /* Common case */
4600 else
4601 return do_argstrip(self, RIGHTSTRIP, args);
4602}
4603
4604
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605static PyObject*
4606unicode_repeat(PyUnicodeObject *str, int len)
4607{
4608 PyUnicodeObject *u;
4609 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004610 int nchars;
4611 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612
4613 if (len < 0)
4614 len = 0;
4615
Tim Peters7a29bd52001-09-12 03:03:31 +00004616 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617 /* no repeat, return original string */
4618 Py_INCREF(str);
4619 return (PyObject*) str;
4620 }
Tim Peters8f422462000-09-09 06:13:41 +00004621
4622 /* ensure # of chars needed doesn't overflow int and # of bytes
4623 * needed doesn't overflow size_t
4624 */
4625 nchars = len * str->length;
4626 if (len && nchars / len != str->length) {
4627 PyErr_SetString(PyExc_OverflowError,
4628 "repeated string is too long");
4629 return NULL;
4630 }
4631 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4632 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4633 PyErr_SetString(PyExc_OverflowError,
4634 "repeated string is too long");
4635 return NULL;
4636 }
4637 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004638 if (!u)
4639 return NULL;
4640
4641 p = u->str;
4642
4643 while (len-- > 0) {
4644 Py_UNICODE_COPY(p, str->str, str->length);
4645 p += str->length;
4646 }
4647
4648 return (PyObject*) u;
4649}
4650
4651PyObject *PyUnicode_Replace(PyObject *obj,
4652 PyObject *subobj,
4653 PyObject *replobj,
4654 int maxcount)
4655{
4656 PyObject *self;
4657 PyObject *str1;
4658 PyObject *str2;
4659 PyObject *result;
4660
4661 self = PyUnicode_FromObject(obj);
4662 if (self == NULL)
4663 return NULL;
4664 str1 = PyUnicode_FromObject(subobj);
4665 if (str1 == NULL) {
4666 Py_DECREF(self);
4667 return NULL;
4668 }
4669 str2 = PyUnicode_FromObject(replobj);
4670 if (str2 == NULL) {
4671 Py_DECREF(self);
4672 Py_DECREF(str1);
4673 return NULL;
4674 }
4675 result = replace((PyUnicodeObject *)self,
4676 (PyUnicodeObject *)str1,
4677 (PyUnicodeObject *)str2,
4678 maxcount);
4679 Py_DECREF(self);
4680 Py_DECREF(str1);
4681 Py_DECREF(str2);
4682 return result;
4683}
4684
4685static char replace__doc__[] =
4686"S.replace (old, new[, maxsplit]) -> unicode\n\
4687\n\
4688Return a copy of S with all occurrences of substring\n\
4689old replaced by new. If the optional argument maxsplit is\n\
4690given, only the first maxsplit occurrences are replaced.";
4691
4692static PyObject*
4693unicode_replace(PyUnicodeObject *self, PyObject *args)
4694{
4695 PyUnicodeObject *str1;
4696 PyUnicodeObject *str2;
4697 int maxcount = -1;
4698 PyObject *result;
4699
4700 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4701 return NULL;
4702 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4703 if (str1 == NULL)
4704 return NULL;
4705 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4706 if (str2 == NULL)
4707 return NULL;
4708
4709 result = replace(self, str1, str2, maxcount);
4710
4711 Py_DECREF(str1);
4712 Py_DECREF(str2);
4713 return result;
4714}
4715
4716static
4717PyObject *unicode_repr(PyObject *unicode)
4718{
4719 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4720 PyUnicode_GET_SIZE(unicode),
4721 1);
4722}
4723
/* Docstring for unicode.rfind().  The range is written in real slice
   notation (S[start:end]) to match the sentence that follows it. */
static char rfind__doc__[] =
"S.rfind(sub [,start [,end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.";
4732
4733static PyObject *
4734unicode_rfind(PyUnicodeObject *self, PyObject *args)
4735{
4736 PyUnicodeObject *substring;
4737 int start = 0;
4738 int end = INT_MAX;
4739 PyObject *result;
4740
Guido van Rossumb8872e62000-05-09 14:14:27 +00004741 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4742 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 return NULL;
4744 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4745 (PyObject *)substring);
4746 if (substring == NULL)
4747 return NULL;
4748
4749 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4750
4751 Py_DECREF(substring);
4752 return result;
4753}
4754
/* Docstring for unicode.rindex(). */
static char rindex__doc__[] =
    "S.rindex(sub [,start [,end]]) -> int\n"
    "\n"
    "Like S.rfind() but raise ValueError when the substring is not found.";
4759
4760static PyObject *
4761unicode_rindex(PyUnicodeObject *self, PyObject *args)
4762{
4763 int result;
4764 PyUnicodeObject *substring;
4765 int start = 0;
4766 int end = INT_MAX;
4767
Guido van Rossumb8872e62000-05-09 14:14:27 +00004768 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4769 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 return NULL;
4771 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4772 (PyObject *)substring);
4773 if (substring == NULL)
4774 return NULL;
4775
4776 result = findstring(self, substring, start, end, -1);
4777
4778 Py_DECREF(substring);
4779 if (result < 0) {
4780 PyErr_SetString(PyExc_ValueError, "substring not found");
4781 return NULL;
4782 }
4783 return PyInt_FromLong(result);
4784}
4785
/* Docstring for unicode.rjust(). */
static char rjust__doc__[] =
    "S.rjust(width) -> unicode\n"
    "\n"
    "Return S right justified in a Unicode string of length width. Padding is\n"
    "done using spaces.";
4791
4792static PyObject *
4793unicode_rjust(PyUnicodeObject *self, PyObject *args)
4794{
4795 int width;
4796 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4797 return NULL;
4798
Tim Peters7a29bd52001-09-12 03:03:31 +00004799 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 Py_INCREF(self);
4801 return (PyObject*) self;
4802 }
4803
4804 return (PyObject*) pad(self, width - self->length, 0, ' ');
4805}
4806
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807static PyObject*
4808unicode_slice(PyUnicodeObject *self, int start, int end)
4809{
4810 /* standard clamping */
4811 if (start < 0)
4812 start = 0;
4813 if (end < 0)
4814 end = 0;
4815 if (end > self->length)
4816 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004817 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818 /* full slice, return original string */
4819 Py_INCREF(self);
4820 return (PyObject*) self;
4821 }
4822 if (start > end)
4823 start = end;
4824 /* copy slice */
4825 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4826 end - start);
4827}
4828
4829PyObject *PyUnicode_Split(PyObject *s,
4830 PyObject *sep,
4831 int maxsplit)
4832{
4833 PyObject *result;
4834
4835 s = PyUnicode_FromObject(s);
4836 if (s == NULL)
4837 return NULL;
4838 if (sep != NULL) {
4839 sep = PyUnicode_FromObject(sep);
4840 if (sep == NULL) {
4841 Py_DECREF(s);
4842 return NULL;
4843 }
4844 }
4845
4846 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4847
4848 Py_DECREF(s);
4849 Py_XDECREF(sep);
4850 return result;
4851}
4852
/* Docstring for unicode.split(). */
static char split__doc__[] =
    "S.split([sep [,maxsplit]]) -> list of strings\n"
    "\n"
    "Return a list of the words in S, using sep as the\n"
    "delimiter string. If maxsplit is given, at most maxsplit\n"
    "splits are done. If sep is not specified, any whitespace string\n"
    "is a separator.";
4860
4861static PyObject*
4862unicode_split(PyUnicodeObject *self, PyObject *args)
4863{
4864 PyObject *substring = Py_None;
4865 int maxcount = -1;
4866
4867 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4868 return NULL;
4869
4870 if (substring == Py_None)
4871 return split(self, NULL, maxcount);
4872 else if (PyUnicode_Check(substring))
4873 return split(self, (PyUnicodeObject *)substring, maxcount);
4874 else
4875 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4876}
4877
/* Docstring for unicode.splitlines().  The signature line has exactly one
   optional argument, so it carries one pair of brackets. */
static char splitlines__doc__[] =
"S.splitlines([keepends]) -> list of strings\n\
\n\
Return a list of the lines in S, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884
4885static PyObject*
4886unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4887{
Guido van Rossum86662912000-04-11 15:38:46 +00004888 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889
Guido van Rossum86662912000-04-11 15:38:46 +00004890 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 return NULL;
4892
Guido van Rossum86662912000-04-11 15:38:46 +00004893 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894}
4895
4896static
4897PyObject *unicode_str(PyUnicodeObject *self)
4898{
Fred Drakee4315f52000-05-09 19:53:39 +00004899 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900}
4901
/* Docstring for unicode.swapcase(). */
static char swapcase__doc__[] =
    "S.swapcase() -> unicode\n"
    "\n"
    "Return a copy of S with uppercase characters converted to lowercase\n"
    "and vice versa.";
4907
4908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004909unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911 return fixup(self, fixswapcase);
4912}
4913
/* Docstring for unicode.translate(). */
static char translate__doc__[] =
    "S.translate(table) -> unicode\n"
    "\n"
    "Return a copy of the string S, where all characters have been mapped\n"
    "through the given translation table, which must be a mapping of\n"
    "Unicode ordinals to Unicode ordinals or None. Unmapped characters\n"
    "are left untouched. Characters mapped to None are deleted.";
4921
4922static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004923unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 return PyUnicode_TranslateCharmap(self->str,
4926 self->length,
4927 table,
4928 "ignore");
4929}
4930
/* Docstring for unicode.upper(). */
static char upper__doc__[] =
    "S.upper() -> unicode\n"
    "\n"
    "Return a copy of S converted to uppercase.";
4935
4936static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004937unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 return fixup(self, fixupper);
4940}
4941
/* Docstring for unicode.zfill().  The string is called S throughout, as in
   the signature line and in the other method docstrings. */
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string S with zeros on the left, to fill a field\n\
of the specified width. The string S is never truncated.";
4947
4948static PyObject *
4949unicode_zfill(PyUnicodeObject *self, PyObject *args)
4950{
4951 int fill;
4952 PyUnicodeObject *u;
4953
4954 int width;
4955 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4956 return NULL;
4957
4958 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00004959 if (PyUnicode_CheckExact(self)) {
4960 Py_INCREF(self);
4961 return (PyObject*) self;
4962 }
4963 else
4964 return PyUnicode_FromUnicode(
4965 PyUnicode_AS_UNICODE(self),
4966 PyUnicode_GET_SIZE(self)
4967 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968 }
4969
4970 fill = width - self->length;
4971
4972 u = pad(self, fill, 0, '0');
4973
Walter Dörwald068325e2002-04-15 13:36:47 +00004974 if (u == NULL)
4975 return NULL;
4976
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 if (u->str[fill] == '+' || u->str[fill] == '-') {
4978 /* move sign to beginning of string */
4979 u->str[0] = u->str[fill];
4980 u->str[fill] = '0';
4981 }
4982
4983 return (PyObject*) u;
4984}
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985
#if 0
/* Compiled out.  Returns the value of unicode_freelist_size as a Python
   int; the matching method-table entry is likewise inside "#if 0". */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4993
/* Docstring for unicode.startswith(). */
static char startswith__doc__[] =
    "S.startswith(prefix[, start[, end]]) -> bool\n"
    "\n"
    "Return True if S starts with the specified prefix, False otherwise. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
5000
5001static PyObject *
5002unicode_startswith(PyUnicodeObject *self,
5003 PyObject *args)
5004{
5005 PyUnicodeObject *substring;
5006 int start = 0;
5007 int end = INT_MAX;
5008 PyObject *result;
5009
Guido van Rossumb8872e62000-05-09 14:14:27 +00005010 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5011 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 return NULL;
5013 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5014 (PyObject *)substring);
5015 if (substring == NULL)
5016 return NULL;
5017
Guido van Rossum77f6a652002-04-03 22:41:51 +00005018 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019
5020 Py_DECREF(substring);
5021 return result;
5022}
5023
5024
/* Docstring for unicode.endswith(). */
static char endswith__doc__[] =
    "S.endswith(suffix[, start[, end]]) -> bool\n"
    "\n"
    "Return True if S ends with the specified suffix, False otherwise. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
5031
5032static PyObject *
5033unicode_endswith(PyUnicodeObject *self,
5034 PyObject *args)
5035{
5036 PyUnicodeObject *substring;
5037 int start = 0;
5038 int end = INT_MAX;
5039 PyObject *result;
5040
Guido van Rossumb8872e62000-05-09 14:14:27 +00005041 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5042 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043 return NULL;
5044 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5045 (PyObject *)substring);
5046 if (substring == NULL)
5047 return NULL;
5048
Guido van Rossum77f6a652002-04-03 22:41:51 +00005049 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050
5051 Py_DECREF(substring);
5052 return result;
5053}
5054
5055
/* Method table for the unicode type.

   Each entry gives the Python-level name, the C implementation, the
   calling-convention flag and the docstring.  The flag has to agree with
   the C function's parameter list: the METH_VARARGS functions in this
   file take (self, args) and parse args with PyArg_ParseTuple();
   METH_O functions such as unicode_translate take (self, arg);
   METH_NOARGS functions such as unicode_swapcase and unicode_upper take
   only self.  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially.  */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    {NULL, NULL}
};
5108
/* Sequence protocol slots for the unicode type.  The two assignment slots
   are 0: no item or slice assignment handler is installed here. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, 		/* sq_length */
    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
    (intargfunc) unicode_repeat, 	/* sq_repeat */
    (intargfunc) unicode_getitem, 	/* sq_item */
    (intintargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
};
5119
5120static int
5121unicode_buffer_getreadbuf(PyUnicodeObject *self,
5122 int index,
5123 const void **ptr)
5124{
5125 if (index != 0) {
5126 PyErr_SetString(PyExc_SystemError,
5127 "accessing non-existent unicode segment");
5128 return -1;
5129 }
5130 *ptr = (void *) self->str;
5131 return PyUnicode_GET_DATA_SIZE(self);
5132}
5133
5134static int
5135unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5136 const void **ptr)
5137{
5138 PyErr_SetString(PyExc_TypeError,
5139 "cannot use unicode as modifyable buffer");
5140 return -1;
5141}
5142
5143static int
5144unicode_buffer_getsegcount(PyUnicodeObject *self,
5145 int *lenp)
5146{
5147 if (lenp)
5148 *lenp = PyUnicode_GET_DATA_SIZE(self);
5149 return 1;
5150}
5151
5152static int
5153unicode_buffer_getcharbuf(PyUnicodeObject *self,
5154 int index,
5155 const void **ptr)
5156{
5157 PyObject *str;
5158
5159 if (index != 0) {
5160 PyErr_SetString(PyExc_SystemError,
5161 "accessing non-existent unicode segment");
5162 return -1;
5163 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005164 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 if (str == NULL)
5166 return -1;
5167 *ptr = (void *) PyString_AS_STRING(str);
5168 return PyString_GET_SIZE(str);
5169}
5170
5171/* Helpers for PyUnicode_Format() */
5172
5173static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005174getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175{
5176 int argidx = *p_argidx;
5177 if (argidx < arglen) {
5178 (*p_argidx)++;
5179 if (arglen < 0)
5180 return args;
5181 else
5182 return PyTuple_GetItem(args, argidx);
5183 }
5184 PyErr_SetString(PyExc_TypeError,
5185 "not enough arguments for format string");
5186 return NULL;
5187}
5188
/* Bit flags collected by PyUnicode_Format() while scanning a conversion
   specifier; each is set by one flag character. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
5194
5195static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197{
5198 register int i;
5199 int len;
5200 va_list va;
5201 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203
5204 /* First, format the string as char array, then expand to Py_UNICODE
5205 array. */
5206 charbuffer = (char *)buffer;
5207 len = vsprintf(charbuffer, format, va);
5208 for (i = len - 1; i >= 0; i--)
5209 buffer[i] = (Py_UNICODE) charbuffer[i];
5210
5211 va_end(va);
5212 return len;
5213}
5214
5215static int
5216formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005217 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 int flags,
5219 int prec,
5220 int type,
5221 PyObject *v)
5222{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005223 /* fmt = '%#.' + `prec` + `type`
5224 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225 char fmt[20];
5226 double x;
5227
5228 x = PyFloat_AsDouble(v);
5229 if (x == -1.0 && PyErr_Occurred())
5230 return -1;
5231 if (prec < 0)
5232 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5234 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005235 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5236 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005237 /* worst case length calc to ensure no buffer overrun:
5238 fmt = %#.<prec>g
5239 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5240 for any double rep.)
5241 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5242 If prec=0 the effective precision is 1 (the leading digit is
5243 always given), therefore increase by one to 10+prec. */
5244 if (buflen <= (size_t)10 + (size_t)prec) {
5245 PyErr_SetString(PyExc_OverflowError,
5246 "formatted float is too long (precision too long?)");
5247 return -1;
5248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249 return usprintf(buf, fmt, x);
5250}
5251
Tim Peters38fd5b62000-09-21 05:43:11 +00005252static PyObject*
5253formatlong(PyObject *val, int flags, int prec, int type)
5254{
5255 char *buf;
5256 int i, len;
5257 PyObject *str; /* temporary string object. */
5258 PyUnicodeObject *result;
5259
5260 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5261 if (!str)
5262 return NULL;
5263 result = _PyUnicode_New(len);
5264 for (i = 0; i < len; i++)
5265 result->str[i] = buf[i];
5266 result->str[len] = 0;
5267 Py_DECREF(str);
5268 return (PyObject*)result;
5269}
5270
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271static int
5272formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005273 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 int flags,
5275 int prec,
5276 int type,
5277 PyObject *v)
5278{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005279 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005280 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5281 * + 1 + 1
5282 * = 24
5283 */
Tim Peters38fd5b62000-09-21 05:43:11 +00005284 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 long x;
5286
5287 x = PyInt_AsLong(v);
5288 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005289 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005291 prec = 1;
5292
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005293 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005294 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5295 */
5296 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005297 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005298 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005299 return -1;
5300 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005301
5302 if ((flags & F_ALT) &&
5303 (type == 'x' || type == 'X')) {
5304 /* When converting under %#x or %#X, there are a number
5305 * of issues that cause pain:
5306 * - when 0 is being converted, the C standard leaves off
5307 * the '0x' or '0X', which is inconsistent with other
5308 * %#x/%#X conversions and inconsistent with Python's
5309 * hex() function
5310 * - there are platforms that violate the standard and
5311 * convert 0 with the '0x' or '0X'
5312 * (Metrowerks, Compaq Tru64)
5313 * - there are platforms that give '0x' when converting
5314 * under %#X, but convert 0 in accordance with the
5315 * standard (OS/2 EMX)
5316 *
5317 * We can achieve the desired consistency by inserting our
5318 * own '0x' or '0X' prefix, and substituting %x/%X in place
5319 * of %#x/%#X.
5320 *
5321 * Note that this is the same approach as used in
5322 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005323 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005324 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5325 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005326 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005327 else {
5328 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5329 (flags&F_ALT) ? "#" : "",
5330 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005331 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 return usprintf(buf, fmt, x);
5333}
5334
5335static int
5336formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005337 size_t buflen,
5338 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005340 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005341 if (PyUnicode_Check(v)) {
5342 if (PyUnicode_GET_SIZE(v) != 1)
5343 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005347 else if (PyString_Check(v)) {
5348 if (PyString_GET_SIZE(v) != 1)
5349 goto onError;
5350 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5351 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352
5353 else {
5354 /* Integer input truncated to a character */
5355 long x;
5356 x = PyInt_AsLong(v);
5357 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005358 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 buf[0] = (char) x;
5360 }
5361 buf[1] = '\0';
5362 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005363
5364 onError:
5365 PyErr_SetString(PyExc_TypeError,
5366 "%c requires int or char");
5367 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368}
5369
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer in which
   floats, ints and chars are formatted.  XXX This is a magic number.
   Each formatting routine does its own bounds checking against it, but a
   better solution may be to malloc a buffer of appropriate size for
   each format.  For now, the current solution is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
5379
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380PyObject *PyUnicode_Format(PyObject *format,
5381 PyObject *args)
5382{
5383 Py_UNICODE *fmt, *res;
5384 int fmtcnt, rescnt, reslen, arglen, argidx;
5385 int args_owned = 0;
5386 PyUnicodeObject *result = NULL;
5387 PyObject *dict = NULL;
5388 PyObject *uformat;
5389
5390 if (format == NULL || args == NULL) {
5391 PyErr_BadInternalCall();
5392 return NULL;
5393 }
5394 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005395 if (uformat == NULL)
5396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 fmt = PyUnicode_AS_UNICODE(uformat);
5398 fmtcnt = PyUnicode_GET_SIZE(uformat);
5399
5400 reslen = rescnt = fmtcnt + 100;
5401 result = _PyUnicode_New(reslen);
5402 if (result == NULL)
5403 goto onError;
5404 res = PyUnicode_AS_UNICODE(result);
5405
5406 if (PyTuple_Check(args)) {
5407 arglen = PyTuple_Size(args);
5408 argidx = 0;
5409 }
5410 else {
5411 arglen = -1;
5412 argidx = -2;
5413 }
5414 if (args->ob_type->tp_as_mapping)
5415 dict = args;
5416
5417 while (--fmtcnt >= 0) {
5418 if (*fmt != '%') {
5419 if (--rescnt < 0) {
5420 rescnt = fmtcnt + 100;
5421 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005422 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 return NULL;
5424 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5425 --rescnt;
5426 }
5427 *res++ = *fmt++;
5428 }
5429 else {
5430 /* Got a format specifier */
5431 int flags = 0;
5432 int width = -1;
5433 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 Py_UNICODE c = '\0';
5435 Py_UNICODE fill;
5436 PyObject *v = NULL;
5437 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005438 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 Py_UNICODE sign;
5440 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005441 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442
5443 fmt++;
5444 if (*fmt == '(') {
5445 Py_UNICODE *keystart;
5446 int keylen;
5447 PyObject *key;
5448 int pcount = 1;
5449
5450 if (dict == NULL) {
5451 PyErr_SetString(PyExc_TypeError,
5452 "format requires a mapping");
5453 goto onError;
5454 }
5455 ++fmt;
5456 --fmtcnt;
5457 keystart = fmt;
5458 /* Skip over balanced parentheses */
5459 while (pcount > 0 && --fmtcnt >= 0) {
5460 if (*fmt == ')')
5461 --pcount;
5462 else if (*fmt == '(')
5463 ++pcount;
5464 fmt++;
5465 }
5466 keylen = fmt - keystart - 1;
5467 if (fmtcnt < 0 || pcount > 0) {
5468 PyErr_SetString(PyExc_ValueError,
5469 "incomplete format key");
5470 goto onError;
5471 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005472#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005473 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 then looked up since Python uses strings to hold
5475 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005476 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 key = PyUnicode_EncodeUTF8(keystart,
5478 keylen,
5479 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005480#else
5481 key = PyUnicode_FromUnicode(keystart, keylen);
5482#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 if (key == NULL)
5484 goto onError;
5485 if (args_owned) {
5486 Py_DECREF(args);
5487 args_owned = 0;
5488 }
5489 args = PyObject_GetItem(dict, key);
5490 Py_DECREF(key);
5491 if (args == NULL) {
5492 goto onError;
5493 }
5494 args_owned = 1;
5495 arglen = -1;
5496 argidx = -2;
5497 }
5498 while (--fmtcnt >= 0) {
5499 switch (c = *fmt++) {
5500 case '-': flags |= F_LJUST; continue;
5501 case '+': flags |= F_SIGN; continue;
5502 case ' ': flags |= F_BLANK; continue;
5503 case '#': flags |= F_ALT; continue;
5504 case '0': flags |= F_ZERO; continue;
5505 }
5506 break;
5507 }
5508 if (c == '*') {
5509 v = getnextarg(args, arglen, &argidx);
5510 if (v == NULL)
5511 goto onError;
5512 if (!PyInt_Check(v)) {
5513 PyErr_SetString(PyExc_TypeError,
5514 "* wants int");
5515 goto onError;
5516 }
5517 width = PyInt_AsLong(v);
5518 if (width < 0) {
5519 flags |= F_LJUST;
5520 width = -width;
5521 }
5522 if (--fmtcnt >= 0)
5523 c = *fmt++;
5524 }
5525 else if (c >= '0' && c <= '9') {
5526 width = c - '0';
5527 while (--fmtcnt >= 0) {
5528 c = *fmt++;
5529 if (c < '0' || c > '9')
5530 break;
5531 if ((width*10) / 10 != width) {
5532 PyErr_SetString(PyExc_ValueError,
5533 "width too big");
5534 goto onError;
5535 }
5536 width = width*10 + (c - '0');
5537 }
5538 }
5539 if (c == '.') {
5540 prec = 0;
5541 if (--fmtcnt >= 0)
5542 c = *fmt++;
5543 if (c == '*') {
5544 v = getnextarg(args, arglen, &argidx);
5545 if (v == NULL)
5546 goto onError;
5547 if (!PyInt_Check(v)) {
5548 PyErr_SetString(PyExc_TypeError,
5549 "* wants int");
5550 goto onError;
5551 }
5552 prec = PyInt_AsLong(v);
5553 if (prec < 0)
5554 prec = 0;
5555 if (--fmtcnt >= 0)
5556 c = *fmt++;
5557 }
5558 else if (c >= '0' && c <= '9') {
5559 prec = c - '0';
5560 while (--fmtcnt >= 0) {
5561 c = Py_CHARMASK(*fmt++);
5562 if (c < '0' || c > '9')
5563 break;
5564 if ((prec*10) / 10 != prec) {
5565 PyErr_SetString(PyExc_ValueError,
5566 "prec too big");
5567 goto onError;
5568 }
5569 prec = prec*10 + (c - '0');
5570 }
5571 }
5572 } /* prec */
5573 if (fmtcnt >= 0) {
5574 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575 if (--fmtcnt >= 0)
5576 c = *fmt++;
5577 }
5578 }
5579 if (fmtcnt < 0) {
5580 PyErr_SetString(PyExc_ValueError,
5581 "incomplete format");
5582 goto onError;
5583 }
5584 if (c != '%') {
5585 v = getnextarg(args, arglen, &argidx);
5586 if (v == NULL)
5587 goto onError;
5588 }
5589 sign = 0;
5590 fill = ' ';
5591 switch (c) {
5592
5593 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005594 pbuf = formatbuf;
5595 /* presume that buffer length is at least 1 */
5596 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 len = 1;
5598 break;
5599
5600 case 's':
5601 case 'r':
5602 if (PyUnicode_Check(v) && c == 's') {
5603 temp = v;
5604 Py_INCREF(temp);
5605 }
5606 else {
5607 PyObject *unicode;
5608 if (c == 's')
5609 temp = PyObject_Str(v);
5610 else
5611 temp = PyObject_Repr(v);
5612 if (temp == NULL)
5613 goto onError;
5614 if (!PyString_Check(temp)) {
5615 /* XXX Note: this should never happen, since
5616 PyObject_Repr() and PyObject_Str() assure
5617 this */
5618 Py_DECREF(temp);
5619 PyErr_SetString(PyExc_TypeError,
5620 "%s argument has non-string str()");
5621 goto onError;
5622 }
Fred Drakee4315f52000-05-09 19:53:39 +00005623 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005625 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 "strict");
5627 Py_DECREF(temp);
5628 temp = unicode;
5629 if (temp == NULL)
5630 goto onError;
5631 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005632 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 len = PyUnicode_GET_SIZE(temp);
5634 if (prec >= 0 && len > prec)
5635 len = prec;
5636 break;
5637
5638 case 'i':
5639 case 'd':
5640 case 'u':
5641 case 'o':
5642 case 'x':
5643 case 'X':
5644 if (c == 'i')
5645 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005646 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005647 temp = formatlong(v, flags, prec, c);
5648 if (!temp)
5649 goto onError;
5650 pbuf = PyUnicode_AS_UNICODE(temp);
5651 len = PyUnicode_GET_SIZE(temp);
5652 /* unbounded ints can always produce
5653 a sign character! */
5654 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005656 else {
5657 pbuf = formatbuf;
5658 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5659 flags, prec, c, v);
5660 if (len < 0)
5661 goto onError;
5662 /* only d conversion is signed */
5663 sign = c == 'd';
5664 }
5665 if (flags & F_ZERO)
5666 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 break;
5668
5669 case 'e':
5670 case 'E':
5671 case 'f':
5672 case 'g':
5673 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005674 pbuf = formatbuf;
5675 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5676 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 if (len < 0)
5678 goto onError;
5679 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005680 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 fill = '0';
5682 break;
5683
5684 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005685 pbuf = formatbuf;
5686 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 if (len < 0)
5688 goto onError;
5689 break;
5690
5691 default:
5692 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005693 "unsupported format character '%c' (0x%x) "
5694 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005695 (31<=c && c<=126) ? c : '?',
5696 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 goto onError;
5698 }
5699 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005700 if (*pbuf == '-' || *pbuf == '+') {
5701 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 len--;
5703 }
5704 else if (flags & F_SIGN)
5705 sign = '+';
5706 else if (flags & F_BLANK)
5707 sign = ' ';
5708 else
5709 sign = 0;
5710 }
5711 if (width < len)
5712 width = len;
5713 if (rescnt < width + (sign != 0)) {
5714 reslen -= rescnt;
5715 rescnt = width + fmtcnt + 100;
5716 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005717 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 return NULL;
5719 res = PyUnicode_AS_UNICODE(result)
5720 + reslen - rescnt;
5721 }
5722 if (sign) {
5723 if (fill != ' ')
5724 *res++ = sign;
5725 rescnt--;
5726 if (width > len)
5727 width--;
5728 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005729 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5730 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005731 assert(pbuf[1] == c);
5732 if (fill != ' ') {
5733 *res++ = *pbuf++;
5734 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005735 }
Tim Petersfff53252001-04-12 18:38:48 +00005736 rescnt -= 2;
5737 width -= 2;
5738 if (width < 0)
5739 width = 0;
5740 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005741 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 if (width > len && !(flags & F_LJUST)) {
5743 do {
5744 --rescnt;
5745 *res++ = fill;
5746 } while (--width > len);
5747 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005748 if (fill == ' ') {
5749 if (sign)
5750 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005751 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005752 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005753 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005754 *res++ = *pbuf++;
5755 *res++ = *pbuf++;
5756 }
5757 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005758 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 res += len;
5760 rescnt -= len;
5761 while (--width >= len) {
5762 --rescnt;
5763 *res++ = ' ';
5764 }
5765 if (dict && (argidx < arglen) && c != '%') {
5766 PyErr_SetString(PyExc_TypeError,
5767 "not all arguments converted");
5768 goto onError;
5769 }
5770 Py_XDECREF(temp);
5771 } /* '%' */
5772 } /* until end */
5773 if (argidx < arglen && !dict) {
5774 PyErr_SetString(PyExc_TypeError,
5775 "not all arguments converted");
5776 goto onError;
5777 }
5778
5779 if (args_owned) {
5780 Py_DECREF(args);
5781 }
5782 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005783 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005784 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 return (PyObject *)result;
5786
5787 onError:
5788 Py_XDECREF(result);
5789 Py_DECREF(uformat);
5790 if (args_owned) {
5791 Py_DECREF(args);
5792 }
5793 return NULL;
5794}
5795
5796static PyBufferProcs unicode_as_buffer = {
5797 (getreadbufferproc) unicode_buffer_getreadbuf,
5798 (getwritebufferproc) unicode_buffer_getwritebuf,
5799 (getsegcountproc) unicode_buffer_getsegcount,
5800 (getcharbufferproc) unicode_buffer_getcharbuf,
5801};
5802
Guido van Rossume023fe02001-08-30 03:12:59 +00005803staticforward PyObject *
5804unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5805
Tim Peters6d6c1a32001-08-02 04:15:00 +00005806static PyObject *
5807unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5808{
5809 PyObject *x = NULL;
5810 static char *kwlist[] = {"string", "encoding", "errors", 0};
5811 char *encoding = NULL;
5812 char *errors = NULL;
5813
Guido van Rossume023fe02001-08-30 03:12:59 +00005814 if (type != &PyUnicode_Type)
5815 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005816 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5817 kwlist, &x, &encoding, &errors))
5818 return NULL;
5819 if (x == NULL)
5820 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005821 if (encoding == NULL && errors == NULL)
5822 return PyObject_Unicode(x);
5823 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005824 return PyUnicode_FromEncodedObject(x, encoding, errors);
5825}
5826
Guido van Rossume023fe02001-08-30 03:12:59 +00005827static PyObject *
5828unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5829{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005830 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005831 int n;
5832
5833 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5834 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5835 if (tmp == NULL)
5836 return NULL;
5837 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005838 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5839 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005840 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005841 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5842 if (pnew->str == NULL) {
5843 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005844 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005845 return NULL;
5846 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005847 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5848 pnew->length = n;
5849 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005850 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005851 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005852}
5853
/* Docstring for the unicode type (installed as tp_doc). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5860
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861PyTypeObject PyUnicode_Type = {
5862 PyObject_HEAD_INIT(&PyType_Type)
5863 0, /* ob_size */
5864 "unicode", /* tp_name */
5865 sizeof(PyUnicodeObject), /* tp_size */
5866 0, /* tp_itemsize */
5867 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005868 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005870 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 0, /* tp_setattr */
5872 (cmpfunc) unicode_compare, /* tp_compare */
5873 (reprfunc) unicode_repr, /* tp_repr */
5874 0, /* tp_as_number */
5875 &unicode_as_sequence, /* tp_as_sequence */
5876 0, /* tp_as_mapping */
5877 (hashfunc) unicode_hash, /* tp_hash*/
5878 0, /* tp_call*/
5879 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005880 PyObject_GenericGetAttr, /* tp_getattro */
5881 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005883 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005884 unicode_doc, /* tp_doc */
5885 0, /* tp_traverse */
5886 0, /* tp_clear */
5887 0, /* tp_richcompare */
5888 0, /* tp_weaklistoffset */
5889 0, /* tp_iter */
5890 0, /* tp_iternext */
5891 unicode_methods, /* tp_methods */
5892 0, /* tp_members */
5893 0, /* tp_getset */
5894 0, /* tp_base */
5895 0, /* tp_dict */
5896 0, /* tp_descr_get */
5897 0, /* tp_descr_set */
5898 0, /* tp_dictoffset */
5899 0, /* tp_init */
5900 0, /* tp_alloc */
5901 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005902 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903};
5904
5905/* Initialize the Unicode implementation */
5906
Thomas Wouters78890102000-07-22 19:25:51 +00005907void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005909 int i;
5910
Fred Drakee4315f52000-05-09 19:53:39 +00005911 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005912 unicode_freelist = NULL;
5913 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005915 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005916 for (i = 0; i < 256; i++)
5917 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918}
5919
5920/* Finalize the Unicode implementation */
5921
5922void
Thomas Wouters78890102000-07-22 19:25:51 +00005923_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005925 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005926 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005928 Py_XDECREF(unicode_empty);
5929 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005930
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005931 for (i = 0; i < 256; i++) {
5932 if (unicode_latin1[i]) {
5933 Py_DECREF(unicode_latin1[i]);
5934 unicode_latin1[i] = NULL;
5935 }
5936 }
5937
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005938 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 PyUnicodeObject *v = u;
5940 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005941 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005942 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005943 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005944 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005946 unicode_freelist = NULL;
5947 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948}