blob: af427dd11887c2ae8546b24f588e6a4fcccb729a [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for the free list.

   Objects put on the free list whose length is below this limit keep
   their character buffer allocated, which saves malloc() calls for
   small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000279 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000280 *unicode = (PyObject *)w;
281 return 0;
282 }
283
284 /* Note that we don't have to modify *unicode for unshared Unicode
285 objects, since we can modify them in-place. */
286 return unicode_resize(v, length);
287}
288
/* Internal API for use in unicodeobject.c only !  Same as
   PyUnicode_Resize(), but casts the variable's address to
   PyObject ** first. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
294 int size)
295{
296 PyUnicodeObject *unicode;
297
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000298 /* If the Unicode data is known at construction time, we can apply
299 some optimizations which share commonly used objects. */
300 if (u != NULL) {
301
302 /* Optimization for empty strings */
303 if (size == 0 && unicode_empty != NULL) {
304 Py_INCREF(unicode_empty);
305 return (PyObject *)unicode_empty;
306 }
307
308 /* Single character Unicode objects in the Latin-1 range are
309 shared when using this constructor */
310 if (size == 1 && *u < 256) {
311 unicode = unicode_latin1[*u];
312 if (!unicode) {
313 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000314 if (!unicode)
315 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000316 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000317 unicode_latin1[*u] = unicode;
318 }
319 Py_INCREF(unicode);
320 return (PyObject *)unicode;
321 }
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 unicode = _PyUnicode_New(size);
325 if (!unicode)
326 return NULL;
327
328 /* Copy the Unicode data into the new object */
329 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331
332 return (PyObject *)unicode;
333}
334
335#ifdef HAVE_WCHAR_H
336
337PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
338 int size)
339{
340 PyUnicodeObject *unicode;
341
342 if (w == NULL) {
343 PyErr_BadInternalCall();
344 return NULL;
345 }
346
347 unicode = _PyUnicode_New(size);
348 if (!unicode)
349 return NULL;
350
351 /* Copy the wchar_t data into the new object */
352#ifdef HAVE_USABLE_WCHAR_T
353 memcpy(unicode->str, w, size * sizeof(wchar_t));
354#else
355 {
356 register Py_UNICODE *u;
357 register int i;
358 u = PyUnicode_AS_UNICODE(unicode);
359 for (i = size; i >= 0; i--)
360 *u++ = *w++;
361 }
362#endif
363
364 return (PyObject *)unicode;
365}
366
367int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
368 register wchar_t *w,
369 int size)
370{
371 if (unicode == NULL) {
372 PyErr_BadInternalCall();
373 return -1;
374 }
375 if (size > PyUnicode_GET_SIZE(unicode))
376 size = PyUnicode_GET_SIZE(unicode);
377#ifdef HAVE_USABLE_WCHAR_T
378 memcpy(w, unicode->str, size * sizeof(wchar_t));
379#else
380 {
381 register Py_UNICODE *u;
382 register int i;
383 u = PyUnicode_AS_UNICODE(unicode);
384 for (i = size; i >= 0; i--)
385 *w++ = *u++;
386 }
387#endif
388
389 return size;
390}
391
392#endif
393
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000394PyObject *PyUnicode_FromOrdinal(int ordinal)
395{
396 Py_UNICODE s[2];
397
398#ifdef Py_UNICODE_WIDE
399 if (ordinal < 0 || ordinal > 0x10ffff) {
400 PyErr_SetString(PyExc_ValueError,
401 "unichr() arg not in range(0x110000) "
402 "(wide Python build)");
403 return NULL;
404 }
405#else
406 if (ordinal < 0 || ordinal > 0xffff) {
407 PyErr_SetString(PyExc_ValueError,
408 "unichr() arg not in range(0x10000) "
409 "(narrow Python build)");
410 return NULL;
411 }
412#endif
413
414 if (ordinal <= 0xffff) {
415 /* UCS-2 character */
416 s[0] = (Py_UNICODE) ordinal;
417 return PyUnicode_FromUnicode(s, 1);
418 }
419 else {
420#ifndef Py_UNICODE_WIDE
421 /* UCS-4 character. store as two surrogate characters */
422 ordinal -= 0x10000L;
423 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
424 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
425 return PyUnicode_FromUnicode(s, 2);
426#else
427 s[0] = (Py_UNICODE)ordinal;
428 return PyUnicode_FromUnicode(s, 1);
429#endif
430 }
431}
432
Guido van Rossumd57fd912000-03-10 22:53:23 +0000433PyObject *PyUnicode_FromObject(register PyObject *obj)
434{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000435 /* XXX Perhaps we should make this API an alias of
436 PyObject_Unicode() instead ?! */
437 if (PyUnicode_CheckExact(obj)) {
438 Py_INCREF(obj);
439 return obj;
440 }
441 if (PyUnicode_Check(obj)) {
442 /* For a Unicode subtype that's not a Unicode object,
443 return a true Unicode object with the same data. */
444 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
445 PyUnicode_GET_SIZE(obj));
446 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
448}
449
450PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
451 const char *encoding,
452 const char *errors)
453{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000456 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457
458 if (obj == NULL) {
459 PyErr_BadInternalCall();
460 return NULL;
461 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000462
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000463#if 0
464 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000465 that no encodings is given and then redirect to
466 PyObject_Unicode() which then applies the additional logic for
467 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000468
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000469 NOTE: This API should really only be used for object which
470 represent *encoded* Unicode !
471
472 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000473 if (PyUnicode_Check(obj)) {
474 if (encoding) {
475 PyErr_SetString(PyExc_TypeError,
476 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000477 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000478 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000479 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481#else
482 if (PyUnicode_Check(obj)) {
483 PyErr_SetString(PyExc_TypeError,
484 "decoding Unicode is not supported");
485 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000486 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487#endif
488
489 /* Coerce object */
490 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000491 s = PyString_AS_STRING(obj);
492 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000493 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000494 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
495 /* Overwrite the error message with something more useful in
496 case of a TypeError. */
497 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499 "coercing to Unicode: need string or buffer, "
500 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 obj->ob_type->tp_name);
502 goto onError;
503 }
504
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 if (len == 0) {
507 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 else
511 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000512
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 return v;
514
515 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517}
518
519PyObject *PyUnicode_Decode(const char *s,
520 int size,
521 const char *encoding,
522 const char *errors)
523{
524 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000525
526 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000527 encoding = PyUnicode_GetDefaultEncoding();
528
529 /* Shortcuts for common default encodings */
530 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000532 else if (strcmp(encoding, "latin-1") == 0)
533 return PyUnicode_DecodeLatin1(s, size, errors);
534 else if (strcmp(encoding, "ascii") == 0)
535 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000536
537 /* Decode via the codec registry */
538 buffer = PyBuffer_FromMemory((void *)s, size);
539 if (buffer == NULL)
540 goto onError;
541 unicode = PyCodec_Decode(buffer, encoding, errors);
542 if (unicode == NULL)
543 goto onError;
544 if (!PyUnicode_Check(unicode)) {
545 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000546 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000547 unicode->ob_type->tp_name);
548 Py_DECREF(unicode);
549 goto onError;
550 }
551 Py_DECREF(buffer);
552 return unicode;
553
554 onError:
555 Py_XDECREF(buffer);
556 return NULL;
557}
558
559PyObject *PyUnicode_Encode(const Py_UNICODE *s,
560 int size,
561 const char *encoding,
562 const char *errors)
563{
564 PyObject *v, *unicode;
565
566 unicode = PyUnicode_FromUnicode(s, size);
567 if (unicode == NULL)
568 return NULL;
569 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
570 Py_DECREF(unicode);
571 return v;
572}
573
574PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
575 const char *encoding,
576 const char *errors)
577{
578 PyObject *v;
579
580 if (!PyUnicode_Check(unicode)) {
581 PyErr_BadArgument();
582 goto onError;
583 }
Fred Drakee4315f52000-05-09 19:53:39 +0000584
585 if (encoding == NULL)
586 encoding = PyUnicode_GetDefaultEncoding();
587
588 /* Shortcuts for common default encodings */
589 if (errors == NULL) {
590 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000591 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000592 else if (strcmp(encoding, "latin-1") == 0)
593 return PyUnicode_AsLatin1String(unicode);
594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_AsASCIIString(unicode);
596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597
598 /* Encode via the codec registry */
599 v = PyCodec_Encode(unicode, encoding, errors);
600 if (v == NULL)
601 goto onError;
602 /* XXX Should we really enforce this ? */
603 if (!PyString_Check(v)) {
604 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000605 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606 v->ob_type->tp_name);
607 Py_DECREF(v);
608 goto onError;
609 }
610 return v;
611
612 onError:
613 return NULL;
614}
615
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000616PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
617 const char *errors)
618{
619 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
620
621 if (v)
622 return v;
623 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
624 if (v && errors == NULL)
625 ((PyUnicodeObject *)unicode)->defenc = v;
626 return v;
627}
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
630{
631 if (!PyUnicode_Check(unicode)) {
632 PyErr_BadArgument();
633 goto onError;
634 }
635 return PyUnicode_AS_UNICODE(unicode);
636
637 onError:
638 return NULL;
639}
640
641int PyUnicode_GetSize(PyObject *unicode)
642{
643 if (!PyUnicode_Check(unicode)) {
644 PyErr_BadArgument();
645 goto onError;
646 }
647 return PyUnicode_GET_SIZE(unicode);
648
649 onError:
650 return -1;
651}
652
Thomas Wouters78890102000-07-22 19:25:51 +0000653const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000654{
655 return unicode_default_encoding;
656}
657
658int PyUnicode_SetDefaultEncoding(const char *encoding)
659{
660 PyObject *v;
661
662 /* Make sure the encoding is valid. As side effect, this also
663 loads the encoding into the codec registry cache. */
664 v = _PyCodec_Lookup(encoding);
665 if (v == NULL)
666 goto onError;
667 Py_DECREF(v);
668 strncpy(unicode_default_encoding,
669 encoding,
670 sizeof(unicode_default_encoding));
671 return 0;
672
673 onError:
674 return -1;
675}
676
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000677/* error handling callback helper:
678 build arguments, call the callback and check the arguments,
679 if no exception occured, copy the replacement to the output
680 and adjust various state variables.
681 return 0 on success, -1 on error
682*/
683
684static
685int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
686 const char *encoding, const char *reason,
687 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
688 PyObject **output, int *outpos, Py_UNICODE **outptr)
689{
690 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
691
692 PyObject *restuple = NULL;
693 PyObject *repunicode = NULL;
694 int outsize = PyUnicode_GET_SIZE(*output);
695 int requiredsize;
696 int newpos;
697 Py_UNICODE *repptr;
698 int repsize;
699 int res = -1;
700
701 if (*errorHandler == NULL) {
702 *errorHandler = PyCodec_LookupError(errors);
703 if (*errorHandler == NULL)
704 goto onError;
705 }
706
707 if (*exceptionObject == NULL) {
708 *exceptionObject = PyUnicodeDecodeError_Create(
709 encoding, input, insize, *startinpos, *endinpos, reason);
710 if (*exceptionObject == NULL)
711 goto onError;
712 }
713 else {
714 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
715 goto onError;
716 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
717 goto onError;
718 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
719 goto onError;
720 }
721
722 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
723 if (restuple == NULL)
724 goto onError;
725 if (!PyTuple_Check(restuple)) {
726 PyErr_Format(PyExc_TypeError, &argparse[4]);
727 goto onError;
728 }
729 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
730 goto onError;
731 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000732 newpos = insize+newpos;
733 if (newpos<0 || newpos>insize) {
734 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
735 goto onError;
736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000737
738 /* need more space? (at least enough for what we
739 have+the replacement+the rest of the string (starting
740 at the new input position), so we won't have to check space
741 when there are no errors in the rest of the string) */
742 repptr = PyUnicode_AS_UNICODE(repunicode);
743 repsize = PyUnicode_GET_SIZE(repunicode);
744 requiredsize = *outpos + repsize + insize-newpos;
745 if (requiredsize > outsize) {
746 if (requiredsize<2*outsize)
747 requiredsize = 2*outsize;
748 if (PyUnicode_Resize(output, requiredsize))
749 goto onError;
750 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
751 }
752 *endinpos = newpos;
753 *inptr = input + newpos;
754 Py_UNICODE_COPY(*outptr, repptr, repsize);
755 *outptr += repsize;
756 *outpos += repsize;
757 /* we made it! */
758 res = 0;
759
760 onError:
761 Py_XDECREF(restuple);
762 return res;
763}
764
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000765/* --- UTF-7 Codec -------------------------------------------------------- */
766
767/* see RFC2152 for details */
768
/* Classification of the 128 ASCII characters for UTF-7, i.e. whether
   a character is special and cannot be directly encoded:
      0 - not special
      1 - special
      2 - whitespace (optional)
      3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
787
/* True if character c must not be written directly in UTF-7: it is
   outside ASCII or classified 1 in utf7_special, or it is whitespace
   (class 2) and encodeWS is set, or it is in Set O (class 3) and
   encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 alphabet character for the low 6 bits of n. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet.  The ranges are
   tested explicitly instead of calling isalnum(): c may be a
   Py_UNICODE value outside the unsigned char range, for which
   isalnum() is undefined, and isalnum() is locale dependent, so it
   could accept characters that UB64() cannot map. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* 6-bit value of the base64 character c (inverse of B64); only
   meaningful when B64CHAR(c) is true. */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 characters to out for as long as at least 6 of the
   `bits` pending bits in ch remain. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit characters to out for as long as at least 16 of the
   `bits` pending bits in ch remain.  Expects `errmsg` and the label
   `utf7Error` to exist at the point of use. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
822
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000823PyObject *PyUnicode_DecodeUTF7(const char *s,
824 int size,
825 const char *errors)
826{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000827 const char *starts = s;
828 int startinpos;
829 int endinpos;
830 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000831 const char *e;
832 PyUnicodeObject *unicode;
833 Py_UNICODE *p;
834 const char *errmsg = "";
835 int inShift = 0;
836 unsigned int bitsleft = 0;
837 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000838 int surrogate = 0;
839 PyObject *errorHandler = NULL;
840 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000841
842 unicode = _PyUnicode_New(size);
843 if (!unicode)
844 return NULL;
845 if (size == 0)
846 return (PyObject *)unicode;
847
848 p = unicode->str;
849 e = s + size;
850
851 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852 Py_UNICODE ch;
853 restart:
854 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000855
856 if (inShift) {
857 if ((ch == '-') || !B64CHAR(ch)) {
858 inShift = 0;
859 s++;
860
861 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
862 if (bitsleft >= 6) {
863 /* The shift sequence has a partial character in it. If
864 bitsleft < 6 then we could just classify it as padding
865 but that is not the case here */
866
867 errmsg = "partial character in shift sequence";
868 goto utf7Error;
869 }
870 /* According to RFC2152 the remaining bits should be zero. We
871 choose to signal an error/insert a replacement character
872 here so indicate the potential of a misencoded character. */
873
874 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
875 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
876 errmsg = "non-zero padding bits in shift sequence";
877 goto utf7Error;
878 }
879
880 if (ch == '-') {
881 if ((s < e) && (*(s) == '-')) {
882 *p++ = '-';
883 inShift = 1;
884 }
885 } else if (SPECIAL(ch,0,0)) {
886 errmsg = "unexpected special character";
887 goto utf7Error;
888 } else {
889 *p++ = ch;
890 }
891 } else {
892 charsleft = (charsleft << 6) | UB64(ch);
893 bitsleft += 6;
894 s++;
895 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
896 }
897 }
898 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000899 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000900 s++;
901 if (s < e && *s == '-') {
902 s++;
903 *p++ = '+';
904 } else
905 {
906 inShift = 1;
907 bitsleft = 0;
908 }
909 }
910 else if (SPECIAL(ch,0,0)) {
911 errmsg = "unexpected special character";
912 s++;
913 goto utf7Error;
914 }
915 else {
916 *p++ = ch;
917 s++;
918 }
919 continue;
920 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000921 outpos = p-PyUnicode_AS_UNICODE(unicode);
922 endinpos = s-starts;
923 if (unicode_decode_call_errorhandler(
924 errors, &errorHandler,
925 "utf7", errmsg,
926 starts, size, &startinpos, &endinpos, &exc, &s,
927 (PyObject **)&unicode, &outpos, &p))
928 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000929 }
930
931 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000932 outpos = p-PyUnicode_AS_UNICODE(unicode);
933 endinpos = size;
934 if (unicode_decode_call_errorhandler(
935 errors, &errorHandler,
936 "utf7", "unterminated shift sequence",
937 starts, size, &startinpos, &endinpos, &exc, &s,
938 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000939 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000940 if (s < e)
941 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942 }
943
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000944 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000945 goto onError;
946
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000947 Py_XDECREF(errorHandler);
948 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 return (PyObject *)unicode;
950
951onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000952 Py_XDECREF(errorHandler);
953 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000954 Py_DECREF(unicode);
955 return NULL;
956}
957
958
959PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
960 int size,
961 int encodeSetO,
962 int encodeWhiteSpace,
963 const char *errors)
964{
965 PyObject *v;
966 /* It might be possible to tighten this worst case */
967 unsigned int cbAllocated = 5 * size;
968 int inShift = 0;
969 int i = 0;
970 unsigned int bitsleft = 0;
971 unsigned long charsleft = 0;
972 char * out;
973 char * start;
974
975 if (size == 0)
976 return PyString_FromStringAndSize(NULL, 0);
977
978 v = PyString_FromStringAndSize(NULL, cbAllocated);
979 if (v == NULL)
980 return NULL;
981
982 start = out = PyString_AS_STRING(v);
983 for (;i < size; ++i) {
984 Py_UNICODE ch = s[i];
985
986 if (!inShift) {
987 if (ch == '+') {
988 *out++ = '+';
989 *out++ = '-';
990 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
991 charsleft = ch;
992 bitsleft = 16;
993 *out++ = '+';
994 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
995 inShift = bitsleft > 0;
996 } else {
997 *out++ = (char) ch;
998 }
999 } else {
1000 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1001 *out++ = B64(charsleft << (6-bitsleft));
1002 charsleft = 0;
1003 bitsleft = 0;
1004 /* Characters not in the BASE64 set implicitly unshift the sequence
1005 so no '-' is required, except if the character is itself a '-' */
1006 if (B64CHAR(ch) || ch == '-') {
1007 *out++ = '-';
1008 }
1009 inShift = 0;
1010 *out++ = (char) ch;
1011 } else {
1012 bitsleft += 16;
1013 charsleft = (charsleft << 16) | ch;
1014 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1015
1016 /* If the next character is special then we dont' need to terminate
1017 the shift sequence. If the next character is not a BASE64 character
1018 or '-' then the shift sequence will be terminated implicitly and we
1019 don't have to insert a '-'. */
1020
1021 if (bitsleft == 0) {
1022 if (i + 1 < size) {
1023 Py_UNICODE ch2 = s[i+1];
1024
1025 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1026
1027 } else if (B64CHAR(ch2) || ch2 == '-') {
1028 *out++ = '-';
1029 inShift = 0;
1030 } else {
1031 inShift = 0;
1032 }
1033
1034 }
1035 else {
1036 *out++ = '-';
1037 inShift = 0;
1038 }
1039 }
1040 }
1041 }
1042 }
1043 if (bitsleft) {
1044 *out++= B64(charsleft << (6-bitsleft) );
1045 *out++ = '-';
1046 }
1047
Tim Peters5de98422002-04-27 18:44:32 +00001048 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001049 return v;
1050}
1051
/* Retire the helper macros used by the UTF-7 functions so that they do not
   remain defined for the rest of the file. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1058
/* --- UTF-8 Codec -------------------------------------------------------- */
1060
/* Sequence length announced by each possible UTF-8 lead byte.  An entry of
   zero marks a byte that is not a legal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00-0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1082
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083PyObject *PyUnicode_DecodeUTF8(const char *s,
1084 int size,
1085 const char *errors)
1086{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001087 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001089 int startinpos;
1090 int endinpos;
1091 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 const char *e;
1093 PyUnicodeObject *unicode;
1094 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001095 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001096 PyObject *errorHandler = NULL;
1097 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098
1099 /* Note: size will always be longer than the resulting Unicode
1100 character count */
1101 unicode = _PyUnicode_New(size);
1102 if (!unicode)
1103 return NULL;
1104 if (size == 0)
1105 return (PyObject *)unicode;
1106
1107 /* Unpack UTF-8 encoded data */
1108 p = unicode->str;
1109 e = s + size;
1110
1111 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001112 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113
1114 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001115 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 s++;
1117 continue;
1118 }
1119
1120 n = utf8_code_length[ch];
1121
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001122 if (s + n > e) {
1123 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001124 startinpos = s-starts;
1125 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001126 goto utf8Error;
1127 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128
1129 switch (n) {
1130
1131 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001132 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001133 startinpos = s-starts;
1134 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001135 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136
1137 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001138 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001139 startinpos = s-starts;
1140 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001141 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142
1143 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001144 if ((s[1] & 0xc0) != 0x80) {
1145 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001146 startinpos = s-starts;
1147 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001148 goto utf8Error;
1149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001151 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001152 startinpos = s-starts;
1153 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001154 errmsg = "illegal encoding";
1155 goto utf8Error;
1156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001157 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001158 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 break;
1160
1161 case 3:
1162 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001163 (s[2] & 0xc0) != 0x80) {
1164 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001165 startinpos = s-starts;
1166 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001167 goto utf8Error;
1168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001170 if (ch < 0x0800) {
1171 /* Note: UTF-8 encodings of surrogates are considered
1172 legal UTF-8 sequences;
1173
1174 XXX For wide builds (UCS-4) we should probably try
1175 to recombine the surrogates into a single code
1176 unit.
1177 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001179 startinpos = s-starts;
1180 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001181 goto utf8Error;
1182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001184 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001185 break;
1186
1187 case 4:
1188 if ((s[1] & 0xc0) != 0x80 ||
1189 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001190 (s[3] & 0xc0) != 0x80) {
1191 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001192 startinpos = s-starts;
1193 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001194 goto utf8Error;
1195 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001196 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1197 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1198 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001199 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001200 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001201 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001202 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001203 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001204 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001205 startinpos = s-starts;
1206 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001207 goto utf8Error;
1208 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001209#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001210 *p++ = (Py_UNICODE)ch;
1211#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001212 /* compute and append the two surrogates: */
1213
1214 /* translate from 10000..10FFFF to 0..FFFF */
1215 ch -= 0x10000;
1216
1217 /* high surrogate = top 10 bits added to D800 */
1218 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1219
1220 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001221 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001222#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 break;
1224
1225 default:
1226 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001228 startinpos = s-starts;
1229 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001230 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 }
1232 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001233 continue;
1234
1235 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001236 outpos = p-PyUnicode_AS_UNICODE(unicode);
1237 if (unicode_decode_call_errorhandler(
1238 errors, &errorHandler,
1239 "utf8", errmsg,
1240 starts, size, &startinpos, &endinpos, &exc, &s,
1241 (PyObject **)&unicode, &outpos, &p))
1242 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 }
1244
1245 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001246 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 goto onError;
1248
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001249 Py_XDECREF(errorHandler);
1250 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 return (PyObject *)unicode;
1252
1253onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001254 Py_XDECREF(errorHandler);
1255 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 Py_DECREF(unicode);
1257 return NULL;
1258}
1259
Tim Peters602f7402002-04-27 18:03:26 +00001260/* Allocation strategy: if the string is short, convert into a stack buffer
1261 and allocate exactly as much space needed at the end. Else allocate the
1262 maximum possible needed (4 result bytes per Unicode character), and return
1263 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001264*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001265PyObject *
1266PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1267 int size,
1268 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269{
Tim Peters602f7402002-04-27 18:03:26 +00001270#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001271
Tim Peters602f7402002-04-27 18:03:26 +00001272 int i; /* index into s of next input byte */
1273 PyObject *v; /* result string object */
1274 char *p; /* next free byte in output buffer */
1275 int nallocated; /* number of result bytes allocated */
1276 int nneeded; /* number of result bytes needed */
1277 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001278
Tim Peters602f7402002-04-27 18:03:26 +00001279 assert(s != NULL);
1280 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281
Tim Peters602f7402002-04-27 18:03:26 +00001282 if (size <= MAX_SHORT_UNICHARS) {
1283 /* Write into the stack buffer; nallocated can't overflow.
1284 * At the end, we'll allocate exactly as much heap space as it
1285 * turns out we need.
1286 */
1287 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1288 v = NULL; /* will allocate after we're done */
1289 p = stackbuf;
1290 }
1291 else {
1292 /* Overallocate on the heap, and give the excess back at the end. */
1293 nallocated = size * 4;
1294 if (nallocated / 4 != size) /* overflow! */
1295 return PyErr_NoMemory();
1296 v = PyString_FromStringAndSize(NULL, nallocated);
1297 if (v == NULL)
1298 return NULL;
1299 p = PyString_AS_STRING(v);
1300 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001301
Tim Peters602f7402002-04-27 18:03:26 +00001302 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001303 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001304
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001305 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001306 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001308
Guido van Rossumd57fd912000-03-10 22:53:23 +00001309 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001310 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001311 *p++ = (char)(0xc0 | (ch >> 6));
1312 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001313 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001314 else {
Tim Peters602f7402002-04-27 18:03:26 +00001315 /* Encode UCS2 Unicode ordinals */
1316 if (ch < 0x10000) {
1317 /* Special case: check for high surrogate */
1318 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1319 Py_UCS4 ch2 = s[i];
1320 /* Check for low surrogate and combine the two to
1321 form a UCS4 value */
1322 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001323 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001324 i++;
1325 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001326 }
Tim Peters602f7402002-04-27 18:03:26 +00001327 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001328 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001329 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001330 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1331 *p++ = (char)(0x80 | (ch & 0x3f));
1332 continue;
1333 }
1334encodeUCS4:
1335 /* Encode UCS4 Unicode ordinals */
1336 *p++ = (char)(0xf0 | (ch >> 18));
1337 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1338 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1339 *p++ = (char)(0x80 | (ch & 0x3f));
1340 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001342
Tim Peters602f7402002-04-27 18:03:26 +00001343 if (v == NULL) {
1344 /* This was stack allocated. */
1345 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1346 assert(nneeded <= nallocated);
1347 v = PyString_FromStringAndSize(stackbuf, nneeded);
1348 }
1349 else {
1350 /* Cut back to size actually needed. */
1351 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1352 assert(nneeded <= nallocated);
1353 _PyString_Resize(&v, nneeded);
1354 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001356
Tim Peters602f7402002-04-27 18:03:26 +00001357#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358}
1359
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1361{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362 if (!PyUnicode_Check(unicode)) {
1363 PyErr_BadArgument();
1364 return NULL;
1365 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001366 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1367 PyUnicode_GET_SIZE(unicode),
1368 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001369}
1370
/* --- UTF-16 Codec ------------------------------------------------------- */
1372
Tim Peters772747b2001-08-09 22:21:55 +00001373PyObject *
1374PyUnicode_DecodeUTF16(const char *s,
1375 int size,
1376 const char *errors,
1377 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001379 const char *starts = s;
1380 int startinpos;
1381 int endinpos;
1382 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 PyUnicodeObject *unicode;
1384 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001385 const unsigned char *q, *e;
1386 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001387 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001388 /* Offsets from q for retrieving byte pairs in the right order. */
1389#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1390 int ihi = 1, ilo = 0;
1391#else
1392 int ihi = 0, ilo = 1;
1393#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001394 PyObject *errorHandler = NULL;
1395 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396
1397 /* Note: size will always be longer than the resulting Unicode
1398 character count */
1399 unicode = _PyUnicode_New(size);
1400 if (!unicode)
1401 return NULL;
1402 if (size == 0)
1403 return (PyObject *)unicode;
1404
1405 /* Unpack UTF-16 encoded data */
1406 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001407 q = (unsigned char *)s;
1408 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409
1410 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001411 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001413 /* Check for BOM marks (U+FEFF) in the input and adjust current
1414 byte order setting accordingly. In native mode, the leading BOM
1415 mark is skipped, in all other modes, it is copied to the output
1416 stream as-is (giving a ZWNBSP character). */
1417 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001418 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001419#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001420 if (bom == 0xFEFF) {
1421 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001422 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001423 }
1424 else if (bom == 0xFFFE) {
1425 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001426 bo = 1;
1427 }
1428#else
Tim Peters772747b2001-08-09 22:21:55 +00001429 if (bom == 0xFEFF) {
1430 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001431 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001432 }
1433 else if (bom == 0xFFFE) {
1434 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001435 bo = -1;
1436 }
1437#endif
1438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001439
Tim Peters772747b2001-08-09 22:21:55 +00001440 if (bo == -1) {
1441 /* force LE */
1442 ihi = 1;
1443 ilo = 0;
1444 }
1445 else if (bo == 1) {
1446 /* force BE */
1447 ihi = 0;
1448 ilo = 1;
1449 }
1450
1451 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452 Py_UNICODE ch;
1453 /* remaing bytes at the end? (size should be even) */
1454 if (e-q<2) {
1455 errmsg = "truncated data";
1456 startinpos = ((const char *)q)-starts;
1457 endinpos = ((const char *)e)-starts;
1458 goto utf16Error;
1459 /* The remaining input chars are ignored if the callback
1460 chooses to skip the input */
1461 }
1462 ch = (q[ihi] << 8) | q[ilo];
1463
Tim Peters772747b2001-08-09 22:21:55 +00001464 q += 2;
1465
Guido van Rossumd57fd912000-03-10 22:53:23 +00001466 if (ch < 0xD800 || ch > 0xDFFF) {
1467 *p++ = ch;
1468 continue;
1469 }
1470
1471 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001472 if (q >= e) {
1473 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001474 startinpos = (((const char *)q)-2)-starts;
1475 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001476 goto utf16Error;
1477 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001478 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001479 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1480 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001481 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001482#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001483 *p++ = ch;
1484 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001485#else
1486 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001487#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001488 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001489 }
1490 else {
1491 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001492 startinpos = (((const char *)q)-4)-starts;
1493 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001494 goto utf16Error;
1495 }
1496
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001498 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001499 startinpos = (((const char *)q)-2)-starts;
1500 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001501 /* Fall through to report the error */
1502
1503 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 outpos = p-PyUnicode_AS_UNICODE(unicode);
1505 if (unicode_decode_call_errorhandler(
1506 errors, &errorHandler,
1507 "utf16", errmsg,
1508 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1509 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001510 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511 }
1512
1513 if (byteorder)
1514 *byteorder = bo;
1515
1516 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001517 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518 goto onError;
1519
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001520 Py_XDECREF(errorHandler);
1521 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001522 return (PyObject *)unicode;
1523
1524onError:
1525 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001526 Py_XDECREF(errorHandler);
1527 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001528 return NULL;
1529}
1530
Tim Peters772747b2001-08-09 22:21:55 +00001531PyObject *
1532PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1533 int size,
1534 const char *errors,
1535 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001536{
1537 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001538 unsigned char *p;
1539 int i, pairs;
1540 /* Offsets from p for storing byte pairs in the right order. */
1541#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1542 int ihi = 1, ilo = 0;
1543#else
1544 int ihi = 0, ilo = 1;
1545#endif
1546
1547#define STORECHAR(CH) \
1548 do { \
1549 p[ihi] = ((CH) >> 8) & 0xff; \
1550 p[ilo] = (CH) & 0xff; \
1551 p += 2; \
1552 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001554 for (i = pairs = 0; i < size; i++)
1555 if (s[i] >= 0x10000)
1556 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001558 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559 if (v == NULL)
1560 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561
Tim Peters772747b2001-08-09 22:21:55 +00001562 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001564 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001565 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001566 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001567
1568 if (byteorder == -1) {
1569 /* force LE */
1570 ihi = 1;
1571 ilo = 0;
1572 }
1573 else if (byteorder == 1) {
1574 /* force BE */
1575 ihi = 0;
1576 ilo = 1;
1577 }
1578
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001579 while (size-- > 0) {
1580 Py_UNICODE ch = *s++;
1581 Py_UNICODE ch2 = 0;
1582 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001583 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1584 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585 }
Tim Peters772747b2001-08-09 22:21:55 +00001586 STORECHAR(ch);
1587 if (ch2)
1588 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001591#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592}
1593
1594PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1595{
1596 if (!PyUnicode_Check(unicode)) {
1597 PyErr_BadArgument();
1598 return NULL;
1599 }
1600 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1601 PyUnicode_GET_SIZE(unicode),
1602 NULL,
1603 0);
1604}
1605
/* --- Unicode Escape Codec ----------------------------------------------- */
1607
Fredrik Lundh06d12682001-01-24 07:59:11 +00001608static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001609
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1611 int size,
1612 const char *errors)
1613{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001614 const char *starts = s;
1615 int startinpos;
1616 int endinpos;
1617 int outpos;
1618 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001620 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001622 char* message;
1623 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 PyObject *errorHandler = NULL;
1625 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001626
Guido van Rossumd57fd912000-03-10 22:53:23 +00001627 /* Escaped strings will always be longer than the resulting
1628 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001629 length after conversion to the true value.
1630 (but if the error callback returns a long replacement string
1631 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001632 v = _PyUnicode_New(size);
1633 if (v == NULL)
1634 goto onError;
1635 if (size == 0)
1636 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001637
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001638 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001639 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001640
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641 while (s < end) {
1642 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001643 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001644 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001645
1646 /* Non-escape characters are interpreted as Unicode ordinals */
1647 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001648 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649 continue;
1650 }
1651
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653 /* \ - Escapes */
1654 s++;
1655 switch (*s++) {
1656
1657 /* \x escapes */
1658 case '\n': break;
1659 case '\\': *p++ = '\\'; break;
1660 case '\'': *p++ = '\''; break;
1661 case '\"': *p++ = '\"'; break;
1662 case 'b': *p++ = '\b'; break;
1663 case 'f': *p++ = '\014'; break; /* FF */
1664 case 't': *p++ = '\t'; break;
1665 case 'n': *p++ = '\n'; break;
1666 case 'r': *p++ = '\r'; break;
1667 case 'v': *p++ = '\013'; break; /* VT */
1668 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1669
1670 /* \OOO (octal) escapes */
1671 case '0': case '1': case '2': case '3':
1672 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001673 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001675 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001676 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001677 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001678 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001679 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 break;
1681
Fredrik Lundhccc74732001-02-18 22:13:49 +00001682 /* hex escapes */
1683 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001685 digits = 2;
1686 message = "truncated \\xXX escape";
1687 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688
Fredrik Lundhccc74732001-02-18 22:13:49 +00001689 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001690 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001691 digits = 4;
1692 message = "truncated \\uXXXX escape";
1693 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694
Fredrik Lundhccc74732001-02-18 22:13:49 +00001695 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001696 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001697 digits = 8;
1698 message = "truncated \\UXXXXXXXX escape";
1699 hexescape:
1700 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001701 outpos = p-PyUnicode_AS_UNICODE(v);
1702 if (s+digits>end) {
1703 endinpos = size;
1704 if (unicode_decode_call_errorhandler(
1705 errors, &errorHandler,
1706 "unicodeescape", "end of string in escape sequence",
1707 starts, size, &startinpos, &endinpos, &exc, &s,
1708 (PyObject **)&v, &outpos, &p))
1709 goto onError;
1710 goto nextByte;
1711 }
1712 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001713 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001714 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001715 endinpos = (s+i+1)-starts;
1716 if (unicode_decode_call_errorhandler(
1717 errors, &errorHandler,
1718 "unicodeescape", message,
1719 starts, size, &startinpos, &endinpos, &exc, &s,
1720 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001721 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001722 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001723 }
1724 chr = (chr<<4) & ~0xF;
1725 if (c >= '0' && c <= '9')
1726 chr += c - '0';
1727 else if (c >= 'a' && c <= 'f')
1728 chr += 10 + c - 'a';
1729 else
1730 chr += 10 + c - 'A';
1731 }
1732 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001733 if (chr == 0xffffffff)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001734 /* _decoding_error will have already written into the
1735 target buffer. */
1736 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001737 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001738 /* when we get here, chr is a 32-bit unicode character */
1739 if (chr <= 0xffff)
1740 /* UCS-2 character */
1741 *p++ = (Py_UNICODE) chr;
1742 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001743 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001744 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001745#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001746 *p++ = chr;
1747#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001748 chr -= 0x10000L;
1749 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001750 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001751#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001752 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 endinpos = s-starts;
1754 outpos = p-PyUnicode_AS_UNICODE(v);
1755 if (unicode_decode_call_errorhandler(
1756 errors, &errorHandler,
1757 "unicodeescape", "illegal Unicode character",
1758 starts, size, &startinpos, &endinpos, &exc, &s,
1759 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001760 goto onError;
1761 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001762 break;
1763
1764 /* \N{name} */
1765 case 'N':
1766 message = "malformed \\N character escape";
1767 if (ucnhash_CAPI == NULL) {
1768 /* load the unicode data module */
1769 PyObject *m, *v;
1770 m = PyImport_ImportModule("unicodedata");
1771 if (m == NULL)
1772 goto ucnhashError;
1773 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1774 Py_DECREF(m);
1775 if (v == NULL)
1776 goto ucnhashError;
1777 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1778 Py_DECREF(v);
1779 if (ucnhash_CAPI == NULL)
1780 goto ucnhashError;
1781 }
1782 if (*s == '{') {
1783 const char *start = s+1;
1784 /* look for the closing brace */
1785 while (*s != '}' && s < end)
1786 s++;
1787 if (s > start && s < end && *s == '}') {
1788 /* found a name. look it up in the unicode database */
1789 message = "unknown Unicode character name";
1790 s++;
1791 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1792 goto store;
1793 }
1794 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 endinpos = s-starts;
1796 outpos = p-PyUnicode_AS_UNICODE(v);
1797 if (unicode_decode_call_errorhandler(
1798 errors, &errorHandler,
1799 "unicodeescape", message,
1800 starts, size, &startinpos, &endinpos, &exc, &s,
1801 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001802 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001803 break;
1804
1805 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001806 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001807 message = "\\ at end of string";
1808 s--;
1809 endinpos = s-starts;
1810 outpos = p-PyUnicode_AS_UNICODE(v);
1811 if (unicode_decode_call_errorhandler(
1812 errors, &errorHandler,
1813 "unicodeescape", message,
1814 starts, size, &startinpos, &endinpos, &exc, &s,
1815 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001816 goto onError;
1817 }
1818 else {
1819 *p++ = '\\';
1820 *p++ = (unsigned char)s[-1];
1821 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001824 nextByte:
1825 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1828 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001830
Fredrik Lundhccc74732001-02-18 22:13:49 +00001831ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001832 PyErr_SetString(
1833 PyExc_UnicodeError,
1834 "\\N escapes not supported (can't load unicodedata module)"
1835 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001836 Py_XDECREF(errorHandler);
1837 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001838 return NULL;
1839
Fredrik Lundhccc74732001-02-18 22:13:49 +00001840onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001842 Py_XDECREF(errorHandler);
1843 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 return NULL;
1845}
1846
1847/* Return a Unicode-Escape string version of the Unicode object.
1848
1849 If quotes is true, the string is enclosed in u"" or u'' quotes as
1850 appropriate.
1851
1852*/
1853
Barry Warsaw51ac5802000-03-20 16:36:48 +00001854static const Py_UNICODE *findchar(const Py_UNICODE *s,
1855 int size,
1856 Py_UNICODE ch);
1857
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858static
1859PyObject *unicodeescape_string(const Py_UNICODE *s,
1860 int size,
1861 int quotes)
1862{
1863 PyObject *repr;
1864 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001866 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867
1868 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1869 if (repr == NULL)
1870 return NULL;
1871
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001872 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873
1874 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 *p++ = 'u';
1876 *p++ = (findchar(s, size, '\'') &&
1877 !findchar(s, size, '"')) ? '"' : '\'';
1878 }
1879 while (size-- > 0) {
1880 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001881
Guido van Rossumd57fd912000-03-10 22:53:23 +00001882 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001883 if (quotes &&
1884 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 *p++ = '\\';
1886 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001887 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001889
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001890#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001891 /* Map 21-bit characters to '\U00xxxxxx' */
1892 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001893 int offset = p - PyString_AS_STRING(repr);
1894
1895 /* Resize the string if necessary */
1896 if (offset + 12 > PyString_GET_SIZE(repr)) {
1897 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001898 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001899 p = PyString_AS_STRING(repr) + offset;
1900 }
1901
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001902 *p++ = '\\';
1903 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001904 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1905 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1906 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1907 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1908 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1909 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1910 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001911 *p++ = hexdigit[ch & 0x0000000F];
1912 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001913 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001914#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001915 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1916 else if (ch >= 0xD800 && ch < 0xDC00) {
1917 Py_UNICODE ch2;
1918 Py_UCS4 ucs;
1919
1920 ch2 = *s++;
1921 size--;
1922 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1923 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1924 *p++ = '\\';
1925 *p++ = 'U';
1926 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1927 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1928 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1929 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1930 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1931 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1932 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1933 *p++ = hexdigit[ucs & 0x0000000F];
1934 continue;
1935 }
1936 /* Fall through: isolated surrogates are copied as-is */
1937 s--;
1938 size++;
1939 }
1940
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001942 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943 *p++ = '\\';
1944 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001945 *p++ = hexdigit[(ch >> 12) & 0x000F];
1946 *p++ = hexdigit[(ch >> 8) & 0x000F];
1947 *p++ = hexdigit[(ch >> 4) & 0x000F];
1948 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001950
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001951 /* Map special whitespace to '\t', \n', '\r' */
1952 else if (ch == '\t') {
1953 *p++ = '\\';
1954 *p++ = 't';
1955 }
1956 else if (ch == '\n') {
1957 *p++ = '\\';
1958 *p++ = 'n';
1959 }
1960 else if (ch == '\r') {
1961 *p++ = '\\';
1962 *p++ = 'r';
1963 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001964
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001965 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001966 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001968 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001969 *p++ = hexdigit[(ch >> 4) & 0x000F];
1970 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001972
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 /* Copy everything else as-is */
1974 else
1975 *p++ = (char) ch;
1976 }
1977 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001978 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979
1980 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001981 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982 return repr;
1983}
1984
1985PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1986 int size)
1987{
1988 return unicodeescape_string(s, size, 0);
1989}
1990
1991PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1992{
1993 if (!PyUnicode_Check(unicode)) {
1994 PyErr_BadArgument();
1995 return NULL;
1996 }
1997 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1998 PyUnicode_GET_SIZE(unicode));
1999}
2000
2001/* --- Raw Unicode Escape Codec ------------------------------------------- */
2002
2003PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2004 int size,
2005 const char *errors)
2006{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002007 const char *starts = s;
2008 int startinpos;
2009 int endinpos;
2010 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002012 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 const char *end;
2014 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002015 PyObject *errorHandler = NULL;
2016 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017
2018 /* Escaped strings will always be longer than the resulting
2019 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 length after conversion to the true value. (But decoding error
2021 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022 v = _PyUnicode_New(size);
2023 if (v == NULL)
2024 goto onError;
2025 if (size == 0)
2026 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002027 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 end = s + size;
2029 while (s < end) {
2030 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002031 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002033 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034
2035 /* Non-escape characters are interpreted as Unicode ordinals */
2036 if (*s != '\\') {
2037 *p++ = (unsigned char)*s++;
2038 continue;
2039 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002040 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041
2042 /* \u-escapes are only interpreted iff the number of leading
2043 backslashes if odd */
2044 bs = s;
2045 for (;s < end;) {
2046 if (*s != '\\')
2047 break;
2048 *p++ = (unsigned char)*s++;
2049 }
2050 if (((s - bs) & 1) == 0 ||
2051 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002052 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 continue;
2054 }
2055 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002056 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057 s++;
2058
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002059 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002061 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 endinpos = s-starts;
2065 if (unicode_decode_call_errorhandler(
2066 errors, &errorHandler,
2067 "rawunicodeescape", "truncated \\uXXXX",
2068 starts, size, &startinpos, &endinpos, &exc, &s,
2069 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002071 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 }
2073 x = (x<<4) & ~0xF;
2074 if (c >= '0' && c <= '9')
2075 x += c - '0';
2076 else if (c >= 'a' && c <= 'f')
2077 x += 10 + c - 'a';
2078 else
2079 x += 10 + c - 'A';
2080 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002081#ifndef Py_UNICODE_WIDE
2082 if (x > 0x10000) {
2083 if (unicode_decode_call_errorhandler(
2084 errors, &errorHandler,
2085 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2086 starts, size, &startinpos, &endinpos, &exc, &s,
2087 (PyObject **)&v, &outpos, &p))
2088 goto onError;
2089 }
2090#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002091 *p++ = x;
2092 nextByte:
2093 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002095 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002096 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002097 Py_XDECREF(errorHandler);
2098 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 return (PyObject *)v;
2100
2101 onError:
2102 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002103 Py_XDECREF(errorHandler);
2104 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105 return NULL;
2106}
2107
2108PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2109 int size)
2110{
2111 PyObject *repr;
2112 char *p;
2113 char *q;
2114
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002115 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002117#ifdef Py_UNICODE_WIDE
2118 repr = PyString_FromStringAndSize(NULL, 10 * size);
2119#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002121#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 if (repr == NULL)
2123 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002124 if (size == 0)
2125 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126
2127 p = q = PyString_AS_STRING(repr);
2128 while (size-- > 0) {
2129 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002130#ifdef Py_UNICODE_WIDE
2131 /* Map 32-bit characters to '\Uxxxxxxxx' */
2132 if (ch >= 0x10000) {
2133 *p++ = '\\';
2134 *p++ = 'U';
2135 *p++ = hexdigit[(ch >> 28) & 0xf];
2136 *p++ = hexdigit[(ch >> 24) & 0xf];
2137 *p++ = hexdigit[(ch >> 20) & 0xf];
2138 *p++ = hexdigit[(ch >> 16) & 0xf];
2139 *p++ = hexdigit[(ch >> 12) & 0xf];
2140 *p++ = hexdigit[(ch >> 8) & 0xf];
2141 *p++ = hexdigit[(ch >> 4) & 0xf];
2142 *p++ = hexdigit[ch & 15];
2143 }
2144 else
2145#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146 /* Map 16-bit characters to '\uxxxx' */
2147 if (ch >= 256) {
2148 *p++ = '\\';
2149 *p++ = 'u';
2150 *p++ = hexdigit[(ch >> 12) & 0xf];
2151 *p++ = hexdigit[(ch >> 8) & 0xf];
2152 *p++ = hexdigit[(ch >> 4) & 0xf];
2153 *p++ = hexdigit[ch & 15];
2154 }
2155 /* Copy everything else as-is */
2156 else
2157 *p++ = (char) ch;
2158 }
2159 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002160 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 return repr;
2162}
2163
2164PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2165{
2166 if (!PyUnicode_Check(unicode)) {
2167 PyErr_BadArgument();
2168 return NULL;
2169 }
2170 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2171 PyUnicode_GET_SIZE(unicode));
2172}
2173
2174/* --- Latin-1 Codec ------------------------------------------------------ */
2175
2176PyObject *PyUnicode_DecodeLatin1(const char *s,
2177 int size,
2178 const char *errors)
2179{
2180 PyUnicodeObject *v;
2181 Py_UNICODE *p;
2182
2183 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002184 if (size == 1 && *(unsigned char*)s < 256) {
2185 Py_UNICODE r = *(unsigned char*)s;
2186 return PyUnicode_FromUnicode(&r, 1);
2187 }
2188
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189 v = _PyUnicode_New(size);
2190 if (v == NULL)
2191 goto onError;
2192 if (size == 0)
2193 return (PyObject *)v;
2194 p = PyUnicode_AS_UNICODE(v);
2195 while (size-- > 0)
2196 *p++ = (unsigned char)*s++;
2197 return (PyObject *)v;
2198
2199 onError:
2200 Py_XDECREF(v);
2201 return NULL;
2202}
2203
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002204/* create or adjust a UnicodeEncodeError */
2205static void make_encode_exception(PyObject **exceptionObject,
2206 const char *encoding,
2207 const Py_UNICODE *unicode, int size,
2208 int startpos, int endpos,
2209 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002211 if (*exceptionObject == NULL) {
2212 *exceptionObject = PyUnicodeEncodeError_Create(
2213 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 }
2215 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002216 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2217 goto onError;
2218 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2219 goto onError;
2220 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2221 goto onError;
2222 return;
2223 onError:
2224 Py_DECREF(*exceptionObject);
2225 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 }
2227}
2228
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002229/* raises a UnicodeEncodeError */
2230static void raise_encode_exception(PyObject **exceptionObject,
2231 const char *encoding,
2232 const Py_UNICODE *unicode, int size,
2233 int startpos, int endpos,
2234 const char *reason)
2235{
2236 make_encode_exception(exceptionObject,
2237 encoding, unicode, size, startpos, endpos, reason);
2238 if (*exceptionObject != NULL)
2239 PyCodec_StrictErrors(*exceptionObject);
2240}
2241
2242/* error handling callback helper:
2243 build arguments, call the callback and check the arguments,
2244 put the result into newpos and return the replacement string, which
2245 has to be freed by the caller */
2246static PyObject *unicode_encode_call_errorhandler(const char *errors,
2247 PyObject **errorHandler,
2248 const char *encoding, const char *reason,
2249 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2250 int startpos, int endpos,
2251 int *newpos)
2252{
2253 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2254
2255 PyObject *restuple;
2256 PyObject *resunicode;
2257
2258 if (*errorHandler == NULL) {
2259 *errorHandler = PyCodec_LookupError(errors);
2260 if (*errorHandler == NULL)
2261 return NULL;
2262 }
2263
2264 make_encode_exception(exceptionObject,
2265 encoding, unicode, size, startpos, endpos, reason);
2266 if (*exceptionObject == NULL)
2267 return NULL;
2268
2269 restuple = PyObject_CallFunctionObjArgs(
2270 *errorHandler, *exceptionObject, NULL);
2271 if (restuple == NULL)
2272 return NULL;
2273 if (!PyTuple_Check(restuple)) {
2274 PyErr_Format(PyExc_TypeError, &argparse[4]);
2275 Py_DECREF(restuple);
2276 return NULL;
2277 }
2278 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2279 &resunicode, newpos)) {
2280 Py_DECREF(restuple);
2281 return NULL;
2282 }
2283 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002284 *newpos = size+*newpos;
2285 if (*newpos<0 || *newpos>size) {
2286 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2287 Py_DECREF(restuple);
2288 return NULL;
2289 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002290 Py_INCREF(resunicode);
2291 Py_DECREF(restuple);
2292 return resunicode;
2293}
2294
2295static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2296 int size,
2297 const char *errors,
2298 int limit)
2299{
2300 /* output object */
2301 PyObject *res;
2302 /* pointers to the beginning and end+1 of input */
2303 const Py_UNICODE *startp = p;
2304 const Py_UNICODE *endp = p + size;
2305 /* pointer to the beginning of the unencodable characters */
2306 /* const Py_UNICODE *badp = NULL; */
2307 /* pointer into the output */
2308 char *str;
2309 /* current output position */
2310 int respos = 0;
2311 int ressize;
2312 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2313 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2314 PyObject *errorHandler = NULL;
2315 PyObject *exc = NULL;
2316 /* the following variable is used for caching string comparisons
2317 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2318 int known_errorHandler = -1;
2319
2320 /* allocate enough for a simple encoding without
2321 replacements, if we need more, we'll resize */
2322 res = PyString_FromStringAndSize(NULL, size);
2323 if (res == NULL)
2324 goto onError;
2325 if (size == 0)
2326 return res;
2327 str = PyString_AS_STRING(res);
2328 ressize = size;
2329
2330 while (p<endp) {
2331 Py_UNICODE c = *p;
2332
2333 /* can we encode this? */
2334 if (c<limit) {
2335 /* no overflow check, because we know that the space is enough */
2336 *str++ = (char)c;
2337 ++p;
2338 }
2339 else {
2340 int unicodepos = p-startp;
2341 int requiredsize;
2342 PyObject *repunicode;
2343 int repsize;
2344 int newpos;
2345 int respos;
2346 Py_UNICODE *uni2;
2347 /* startpos for collecting unencodable chars */
2348 const Py_UNICODE *collstart = p;
2349 const Py_UNICODE *collend = p;
2350 /* find all unecodable characters */
2351 while ((collend < endp) && ((*collend)>=limit))
2352 ++collend;
2353 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2354 if (known_errorHandler==-1) {
2355 if ((errors==NULL) || (!strcmp(errors, "strict")))
2356 known_errorHandler = 1;
2357 else if (!strcmp(errors, "replace"))
2358 known_errorHandler = 2;
2359 else if (!strcmp(errors, "ignore"))
2360 known_errorHandler = 3;
2361 else if (!strcmp(errors, "xmlcharrefreplace"))
2362 known_errorHandler = 4;
2363 else
2364 known_errorHandler = 0;
2365 }
2366 switch (known_errorHandler) {
2367 case 1: /* strict */
2368 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2369 goto onError;
2370 case 2: /* replace */
2371 while (collstart++<collend)
2372 *str++ = '?'; /* fall through */
2373 case 3: /* ignore */
2374 p = collend;
2375 break;
2376 case 4: /* xmlcharrefreplace */
2377 respos = str-PyString_AS_STRING(res);
2378 /* determine replacement size (temporarily (mis)uses p) */
2379 for (p = collstart, repsize = 0; p < collend; ++p) {
2380 if (*p<10)
2381 repsize += 2+1+1;
2382 else if (*p<100)
2383 repsize += 2+2+1;
2384 else if (*p<1000)
2385 repsize += 2+3+1;
2386 else if (*p<10000)
2387 repsize += 2+4+1;
2388 else if (*p<100000)
2389 repsize += 2+5+1;
2390 else if (*p<1000000)
2391 repsize += 2+6+1;
2392 else
2393 repsize += 2+7+1;
2394 }
2395 requiredsize = respos+repsize+(endp-collend);
2396 if (requiredsize > ressize) {
2397 if (requiredsize<2*ressize)
2398 requiredsize = 2*ressize;
2399 if (_PyString_Resize(&res, requiredsize))
2400 goto onError;
2401 str = PyString_AS_STRING(res) + respos;
2402 ressize = requiredsize;
2403 }
2404 /* generate replacement (temporarily (mis)uses p) */
2405 for (p = collstart; p < collend; ++p) {
2406 str += sprintf(str, "&#%d;", (int)*p);
2407 }
2408 p = collend;
2409 break;
2410 default:
2411 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2412 encoding, reason, startp, size, &exc,
2413 collstart-startp, collend-startp, &newpos);
2414 if (repunicode == NULL)
2415 goto onError;
2416 /* need more space? (at least enough for what we
2417 have+the replacement+the rest of the string, so
2418 we won't have to check space for encodable characters) */
2419 respos = str-PyString_AS_STRING(res);
2420 repsize = PyUnicode_GET_SIZE(repunicode);
2421 requiredsize = respos+repsize+(endp-collend);
2422 if (requiredsize > ressize) {
2423 if (requiredsize<2*ressize)
2424 requiredsize = 2*ressize;
2425 if (_PyString_Resize(&res, requiredsize)) {
2426 Py_DECREF(repunicode);
2427 goto onError;
2428 }
2429 str = PyString_AS_STRING(res) + respos;
2430 ressize = requiredsize;
2431 }
2432 /* check if there is anything unencodable in the replacement
2433 and copy it to the output */
2434 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2435 c = *uni2;
2436 if (c >= limit) {
2437 raise_encode_exception(&exc, encoding, startp, size,
2438 unicodepos, unicodepos+1, reason);
2439 Py_DECREF(repunicode);
2440 goto onError;
2441 }
2442 *str = (char)c;
2443 }
2444 p = startp + newpos;
2445 Py_DECREF(repunicode);
2446 }
2447 }
2448 }
2449 /* Resize if we allocated to much */
2450 respos = str-PyString_AS_STRING(res);
2451 if (respos<ressize)
2452 /* If this falls res will be NULL */
2453 _PyString_Resize(&res, respos);
2454 Py_XDECREF(errorHandler);
2455 Py_XDECREF(exc);
2456 return res;
2457
2458 onError:
2459 Py_XDECREF(res);
2460 Py_XDECREF(errorHandler);
2461 Py_XDECREF(exc);
2462 return NULL;
2463}
2464
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2466 int size,
2467 const char *errors)
2468{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002469 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470}
2471
2472PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2473{
2474 if (!PyUnicode_Check(unicode)) {
2475 PyErr_BadArgument();
2476 return NULL;
2477 }
2478 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2479 PyUnicode_GET_SIZE(unicode),
2480 NULL);
2481}
2482
2483/* --- 7-bit ASCII Codec -------------------------------------------------- */
2484
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485PyObject *PyUnicode_DecodeASCII(const char *s,
2486 int size,
2487 const char *errors)
2488{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002489 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 PyUnicodeObject *v;
2491 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002492 int startinpos;
2493 int endinpos;
2494 int outpos;
2495 const char *e;
2496 PyObject *errorHandler = NULL;
2497 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498
2499 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002500 if (size == 1 && *(unsigned char*)s < 128) {
2501 Py_UNICODE r = *(unsigned char*)s;
2502 return PyUnicode_FromUnicode(&r, 1);
2503 }
2504
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 v = _PyUnicode_New(size);
2506 if (v == NULL)
2507 goto onError;
2508 if (size == 0)
2509 return (PyObject *)v;
2510 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511 e = s + size;
2512 while (s < e) {
2513 register unsigned char c = (unsigned char)*s;
2514 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002516 ++s;
2517 }
2518 else {
2519 startinpos = s-starts;
2520 endinpos = startinpos + 1;
2521 outpos = p-PyUnicode_AS_UNICODE(v);
2522 if (unicode_decode_call_errorhandler(
2523 errors, &errorHandler,
2524 "ascii", "ordinal not in range(128)",
2525 starts, size, &startinpos, &endinpos, &exc, &s,
2526 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002528 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002530 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002531 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002532 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002533 Py_XDECREF(errorHandler);
2534 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 return (PyObject *)v;
2536
2537 onError:
2538 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002539 Py_XDECREF(errorHandler);
2540 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541 return NULL;
2542}
2543
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2545 int size,
2546 const char *errors)
2547{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002548 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549}
2550
2551PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2552{
2553 if (!PyUnicode_Check(unicode)) {
2554 PyErr_BadArgument();
2555 return NULL;
2556 }
2557 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2558 PyUnicode_GET_SIZE(unicode),
2559 NULL);
2560}
2561
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002562#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002563
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002564/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002565
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002566PyObject *PyUnicode_DecodeMBCS(const char *s,
2567 int size,
2568 const char *errors)
2569{
2570 PyUnicodeObject *v;
2571 Py_UNICODE *p;
2572
2573 /* First get the size of the result */
2574 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002575 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002576 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2577
2578 v = _PyUnicode_New(usize);
2579 if (v == NULL)
2580 return NULL;
2581 if (usize == 0)
2582 return (PyObject *)v;
2583 p = PyUnicode_AS_UNICODE(v);
2584 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2585 Py_DECREF(v);
2586 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2587 }
2588
2589 return (PyObject *)v;
2590}
2591
2592PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2593 int size,
2594 const char *errors)
2595{
2596 PyObject *repr;
2597 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002598 DWORD mbcssize;
2599
2600 /* If there are no characters, bail now! */
2601 if (size==0)
2602 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002603
2604 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002605 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002606 if (mbcssize==0)
2607 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2608
2609 repr = PyString_FromStringAndSize(NULL, mbcssize);
2610 if (repr == NULL)
2611 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002612 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002613 return repr;
2614
2615 /* Do the conversion */
2616 s = PyString_AS_STRING(repr);
2617 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2618 Py_DECREF(repr);
2619 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2620 }
2621 return repr;
2622}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002623
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002624#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002625
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626/* --- Character Mapping Codec -------------------------------------------- */
2627
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628PyObject *PyUnicode_DecodeCharmap(const char *s,
2629 int size,
2630 PyObject *mapping,
2631 const char *errors)
2632{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002633 const char *starts = s;
2634 int startinpos;
2635 int endinpos;
2636 int outpos;
2637 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638 PyUnicodeObject *v;
2639 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002640 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002641 PyObject *errorHandler = NULL;
2642 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643
2644 /* Default to Latin-1 */
2645 if (mapping == NULL)
2646 return PyUnicode_DecodeLatin1(s, size, errors);
2647
2648 v = _PyUnicode_New(size);
2649 if (v == NULL)
2650 goto onError;
2651 if (size == 0)
2652 return (PyObject *)v;
2653 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002654 e = s + size;
2655 while (s < e) {
2656 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657 PyObject *w, *x;
2658
2659 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2660 w = PyInt_FromLong((long)ch);
2661 if (w == NULL)
2662 goto onError;
2663 x = PyObject_GetItem(mapping, w);
2664 Py_DECREF(w);
2665 if (x == NULL) {
2666 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002667 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002669 x = Py_None;
2670 Py_INCREF(x);
2671 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002672 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673 }
2674
2675 /* Apply mapping */
2676 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002677 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 if (value < 0 || value > 65535) {
2679 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002680 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 Py_DECREF(x);
2682 goto onError;
2683 }
2684 *p++ = (Py_UNICODE)value;
2685 }
2686 else if (x == Py_None) {
2687 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002688 outpos = p-PyUnicode_AS_UNICODE(v);
2689 startinpos = s-starts;
2690 endinpos = startinpos+1;
2691 if (unicode_decode_call_errorhandler(
2692 errors, &errorHandler,
2693 "charmap", "character maps to <undefined>",
2694 starts, size, &startinpos, &endinpos, &exc, &s,
2695 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 Py_DECREF(x);
2697 goto onError;
2698 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002699 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 }
2701 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002702 int targetsize = PyUnicode_GET_SIZE(x);
2703
2704 if (targetsize == 1)
2705 /* 1-1 mapping */
2706 *p++ = *PyUnicode_AS_UNICODE(x);
2707
2708 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002710 if (targetsize > extrachars) {
2711 /* resize first */
2712 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2713 int needed = (targetsize - extrachars) + \
2714 (targetsize << 2);
2715 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002716 if (_PyUnicode_Resize(&v,
2717 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002718 Py_DECREF(x);
2719 goto onError;
2720 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002721 p = PyUnicode_AS_UNICODE(v) + oldpos;
2722 }
2723 Py_UNICODE_COPY(p,
2724 PyUnicode_AS_UNICODE(x),
2725 targetsize);
2726 p += targetsize;
2727 extrachars -= targetsize;
2728 }
2729 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 }
2731 else {
2732 /* wrong return value */
2733 PyErr_SetString(PyExc_TypeError,
2734 "character mapping must return integer, None or unicode");
2735 Py_DECREF(x);
2736 goto onError;
2737 }
2738 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002739 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 }
2741 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002742 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 Py_XDECREF(errorHandler);
2745 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 return (PyObject *)v;
2747
2748 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002749 Py_XDECREF(errorHandler);
2750 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 Py_XDECREF(v);
2752 return NULL;
2753}
2754
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002755/* Lookup the character ch in the mapping. If the character
2756 can't be found, Py_None is returned (or NULL, if another
2757 error occured). */
2758static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 PyObject *w = PyInt_FromLong((long)c);
2761 PyObject *x;
2762
2763 if (w == NULL)
2764 return NULL;
2765 x = PyObject_GetItem(mapping, w);
2766 Py_DECREF(w);
2767 if (x == NULL) {
2768 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2769 /* No mapping found means: mapping is undefined. */
2770 PyErr_Clear();
2771 x = Py_None;
2772 Py_INCREF(x);
2773 return x;
2774 } else
2775 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002777 else if (x == Py_None)
2778 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 else if (PyInt_Check(x)) {
2780 long value = PyInt_AS_LONG(x);
2781 if (value < 0 || value > 255) {
2782 PyErr_SetString(PyExc_TypeError,
2783 "character mapping must be in range(256)");
2784 Py_DECREF(x);
2785 return NULL;
2786 }
2787 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002789 else if (PyString_Check(x))
2790 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002792 /* wrong return value */
2793 PyErr_SetString(PyExc_TypeError,
2794 "character mapping must return integer, None or str");
2795 Py_DECREF(x);
2796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 }
2798}
2799
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800/* lookup the character, put the result in the output string and adjust
2801 various state variables. Reallocate the output string if not enough
2802 space is available. Return a new reference to the object that
2803 was put in the output buffer, or Py_None, if the mapping was undefined
2804 (in which case no character was written) or NULL, if a
2805 reallocation error ocurred. The called must decref the result */
2806static
2807PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2808 PyObject **outobj, int *outpos)
2809{
2810 PyObject *rep = charmapencode_lookup(c, mapping);
2811
2812 if (rep==NULL)
2813 return NULL;
2814 else if (rep==Py_None)
2815 return rep;
2816 else {
2817 char *outstart = PyString_AS_STRING(*outobj);
2818 int outsize = PyString_GET_SIZE(*outobj);
2819 if (PyInt_Check(rep)) {
2820 int requiredsize = *outpos+1;
2821 if (outsize<requiredsize) {
2822 /* exponentially overallocate to minimize reallocations */
2823 if (requiredsize < 2*outsize)
2824 requiredsize = 2*outsize;
2825 if (_PyString_Resize(outobj, requiredsize)) {
2826 Py_DECREF(rep);
2827 return NULL;
2828 }
2829 outstart = PyString_AS_STRING(*outobj);
2830 }
2831 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2832 }
2833 else {
2834 const char *repchars = PyString_AS_STRING(rep);
2835 int repsize = PyString_GET_SIZE(rep);
2836 int requiredsize = *outpos+repsize;
2837 if (outsize<requiredsize) {
2838 /* exponentially overallocate to minimize reallocations */
2839 if (requiredsize < 2*outsize)
2840 requiredsize = 2*outsize;
2841 if (_PyString_Resize(outobj, requiredsize)) {
2842 Py_DECREF(rep);
2843 return NULL;
2844 }
2845 outstart = PyString_AS_STRING(*outobj);
2846 }
2847 memcpy(outstart + *outpos, repchars, repsize);
2848 *outpos += repsize;
2849 }
2850 }
2851 return rep;
2852}
2853
2854/* handle an error in PyUnicode_EncodeCharmap
2855 Return 0 on success, -1 on error */
2856static
2857int charmap_encoding_error(
2858 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2859 PyObject **exceptionObject,
2860 int *known_errorHandler, PyObject *errorHandler, const char *errors,
2861 PyObject **res, int *respos)
2862{
2863 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2864 int repsize;
2865 int newpos;
2866 Py_UNICODE *uni2;
2867 /* startpos for collecting unencodable chars */
2868 int collstartpos = *inpos;
2869 int collendpos = *inpos+1;
2870 int collpos;
2871 char *encoding = "charmap";
2872 char *reason = "character maps to <undefined>";
2873
2874 PyObject *x;
2875 /* find all unencodable characters */
2876 while (collendpos < size) {
2877 x = charmapencode_lookup(p[collendpos], mapping);
2878 if (x==NULL)
2879 return -1;
2880 else if (x!=Py_None) {
2881 Py_DECREF(x);
2882 break;
2883 }
2884 Py_DECREF(x);
2885 ++collendpos;
2886 }
2887 /* cache callback name lookup
2888 * (if not done yet, i.e. it's the first error) */
2889 if (*known_errorHandler==-1) {
2890 if ((errors==NULL) || (!strcmp(errors, "strict")))
2891 *known_errorHandler = 1;
2892 else if (!strcmp(errors, "replace"))
2893 *known_errorHandler = 2;
2894 else if (!strcmp(errors, "ignore"))
2895 *known_errorHandler = 3;
2896 else if (!strcmp(errors, "xmlcharrefreplace"))
2897 *known_errorHandler = 4;
2898 else
2899 *known_errorHandler = 0;
2900 }
2901 switch (*known_errorHandler) {
2902 case 1: /* strict */
2903 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2904 return -1;
2905 case 2: /* replace */
2906 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2907 x = charmapencode_output('?', mapping, res, respos);
2908 if (x==NULL) {
2909 return -1;
2910 }
2911 else if (x==Py_None) {
2912 Py_DECREF(x);
2913 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2914 return -1;
2915 }
2916 Py_DECREF(x);
2917 }
2918 /* fall through */
2919 case 3: /* ignore */
2920 *inpos = collendpos;
2921 break;
2922 case 4: /* xmlcharrefreplace */
2923 /* generate replacement (temporarily (mis)uses p) */
2924 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2925 char buffer[2+29+1+1];
2926 char *cp;
2927 sprintf(buffer, "&#%d;", (int)p[collpos]);
2928 for (cp = buffer; *cp; ++cp) {
2929 x = charmapencode_output(*cp, mapping, res, respos);
2930 if (x==NULL)
2931 return -1;
2932 else if (x==Py_None) {
2933 Py_DECREF(x);
2934 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2935 return -1;
2936 }
2937 Py_DECREF(x);
2938 }
2939 }
2940 *inpos = collendpos;
2941 break;
2942 default:
2943 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2944 encoding, reason, p, size, exceptionObject,
2945 collstartpos, collendpos, &newpos);
2946 if (repunicode == NULL)
2947 return -1;
2948 /* generate replacement */
2949 repsize = PyUnicode_GET_SIZE(repunicode);
2950 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2951 x = charmapencode_output(*uni2, mapping, res, respos);
2952 if (x==NULL) {
2953 Py_DECREF(repunicode);
2954 return -1;
2955 }
2956 else if (x==Py_None) {
2957 Py_DECREF(repunicode);
2958 Py_DECREF(x);
2959 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2960 return -1;
2961 }
2962 Py_DECREF(x);
2963 }
2964 *inpos = newpos;
2965 Py_DECREF(repunicode);
2966 }
2967 return 0;
2968}
2969
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2971 int size,
2972 PyObject *mapping,
2973 const char *errors)
2974{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002975 /* output object */
2976 PyObject *res = NULL;
2977 /* current input position */
2978 int inpos = 0;
2979 /* current output position */
2980 int respos = 0;
2981 PyObject *errorHandler = NULL;
2982 PyObject *exc = NULL;
2983 /* the following variable is used for caching string comparisons
2984 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
2985 * 3=ignore, 4=xmlcharrefreplace */
2986 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987
2988 /* Default to Latin-1 */
2989 if (mapping == NULL)
2990 return PyUnicode_EncodeLatin1(p, size, errors);
2991
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002992 /* allocate enough for a simple encoding without
2993 replacements, if we need more, we'll resize */
2994 res = PyString_FromStringAndSize(NULL, size);
2995 if (res == NULL)
2996 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002997 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002998 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000 while (inpos<size) {
3001 /* try to encode it */
3002 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3003 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003005 if (x==Py_None) { /* unencodable character */
3006 if (charmap_encoding_error(p, size, &inpos, mapping,
3007 &exc,
3008 &known_errorHandler, errorHandler, errors,
3009 &res, &respos))
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003010 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003012 else
3013 /* done with this character => adjust input position */
3014 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 Py_DECREF(x);
3016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003018 /* Resize if we allocated to much */
3019 if (respos<PyString_GET_SIZE(res)) {
3020 if (_PyString_Resize(&res, respos))
3021 goto onError;
3022 }
3023 Py_XDECREF(exc);
3024 Py_XDECREF(errorHandler);
3025 return res;
3026
3027 onError:
3028 Py_XDECREF(res);
3029 Py_XDECREF(exc);
3030 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 return NULL;
3032}
3033
3034PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3035 PyObject *mapping)
3036{
3037 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3038 PyErr_BadArgument();
3039 return NULL;
3040 }
3041 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3042 PyUnicode_GET_SIZE(unicode),
3043 mapping,
3044 NULL);
3045}
3046
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003047/* create or adjust a UnicodeTranslateError */
3048static void make_translate_exception(PyObject **exceptionObject,
3049 const Py_UNICODE *unicode, int size,
3050 int startpos, int endpos,
3051 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 if (*exceptionObject == NULL) {
3054 *exceptionObject = PyUnicodeTranslateError_Create(
3055 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 }
3057 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003058 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3059 goto onError;
3060 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3061 goto onError;
3062 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3063 goto onError;
3064 return;
3065 onError:
3066 Py_DECREF(*exceptionObject);
3067 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 }
3069}
3070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003071/* raises a UnicodeTranslateError */
3072static void raise_translate_exception(PyObject **exceptionObject,
3073 const Py_UNICODE *unicode, int size,
3074 int startpos, int endpos,
3075 const char *reason)
3076{
3077 make_translate_exception(exceptionObject,
3078 unicode, size, startpos, endpos, reason);
3079 if (*exceptionObject != NULL)
3080 PyCodec_StrictErrors(*exceptionObject);
3081}
3082
3083/* error handling callback helper:
3084 build arguments, call the callback and check the arguments,
3085 put the result into newpos and return the replacement string, which
3086 has to be freed by the caller */
3087static PyObject *unicode_translate_call_errorhandler(const char *errors,
3088 PyObject **errorHandler,
3089 const char *reason,
3090 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3091 int startpos, int endpos,
3092 int *newpos)
3093{
3094 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3095
3096 PyObject *restuple;
3097 PyObject *resunicode;
3098
3099 if (*errorHandler == NULL) {
3100 *errorHandler = PyCodec_LookupError(errors);
3101 if (*errorHandler == NULL)
3102 return NULL;
3103 }
3104
3105 make_translate_exception(exceptionObject,
3106 unicode, size, startpos, endpos, reason);
3107 if (*exceptionObject == NULL)
3108 return NULL;
3109
3110 restuple = PyObject_CallFunctionObjArgs(
3111 *errorHandler, *exceptionObject, NULL);
3112 if (restuple == NULL)
3113 return NULL;
3114 if (!PyTuple_Check(restuple)) {
3115 PyErr_Format(PyExc_TypeError, &argparse[4]);
3116 Py_DECREF(restuple);
3117 return NULL;
3118 }
3119 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3120 &resunicode, newpos)) {
3121 Py_DECREF(restuple);
3122 return NULL;
3123 }
3124 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003125 *newpos = size+*newpos;
3126 if (*newpos<0 || *newpos>size) {
3127 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3128 Py_DECREF(restuple);
3129 return NULL;
3130 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003131 Py_INCREF(resunicode);
3132 Py_DECREF(restuple);
3133 return resunicode;
3134}
3135
3136/* Lookup the character ch in the mapping and put the result in result,
3137 which must be decrefed by the caller.
3138 Return 0 on success, -1 on error */
3139static
3140int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3141{
3142 PyObject *w = PyInt_FromLong((long)c);
3143 PyObject *x;
3144
3145 if (w == NULL)
3146 return -1;
3147 x = PyObject_GetItem(mapping, w);
3148 Py_DECREF(w);
3149 if (x == NULL) {
3150 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3151 /* No mapping found means: use 1:1 mapping. */
3152 PyErr_Clear();
3153 *result = NULL;
3154 return 0;
3155 } else
3156 return -1;
3157 }
3158 else if (x == Py_None) {
3159 *result = x;
3160 return 0;
3161 }
3162 else if (PyInt_Check(x)) {
3163 long value = PyInt_AS_LONG(x);
3164 long max = PyUnicode_GetMax();
3165 if (value < 0 || value > max) {
3166 PyErr_Format(PyExc_TypeError,
3167 "character mapping must be in range(0x%lx)", max+1);
3168 Py_DECREF(x);
3169 return -1;
3170 }
3171 *result = x;
3172 return 0;
3173 }
3174 else if (PyUnicode_Check(x)) {
3175 *result = x;
3176 return 0;
3177 }
3178 else {
3179 /* wrong return value */
3180 PyErr_SetString(PyExc_TypeError,
3181 "character mapping must return integer, None or unicode");
3182 return -1;
3183 }
3184}
3185/* ensure that *outobj is at least requiredsize characters long,
3186if not reallocate and adjust various state variables.
3187Return 0 on success, -1 on error */
3188static
3189int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3190 int requiredsize)
3191{
3192 if (requiredsize > *outsize) {
3193 /* remember old output position */
3194 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3195 /* exponentially overallocate to minimize reallocations */
3196 if (requiredsize < 2 * *outsize)
3197 requiredsize = 2 * *outsize;
3198 if (_PyUnicode_Resize(outobj, requiredsize))
3199 return -1;
3200 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3201 *outsize = requiredsize;
3202 }
3203 return 0;
3204}
3205/* lookup the character, put the result in the output string and adjust
3206 various state variables. Return a new reference to the object that
3207 was put in the output buffer in *result, or Py_None, if the mapping was
3208 undefined (in which case no character was written).
3209 The called must decref result.
3210 Return 0 on success, -1 on error. */
3211static
3212int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3213 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3214{
3215 if (charmaptranslate_lookup(c, mapping, res))
3216 return -1;
3217 if (*res==NULL) {
3218 /* not found => default to 1:1 mapping */
3219 *(*outp)++ = (Py_UNICODE)c;
3220 }
3221 else if (*res==Py_None)
3222 ;
3223 else if (PyInt_Check(*res)) {
3224 /* no overflow check, because we know that the space is enough */
3225 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3226 }
3227 else if (PyUnicode_Check(*res)) {
3228 int repsize = PyUnicode_GET_SIZE(*res);
3229 if (repsize==1) {
3230 /* no overflow check, because we know that the space is enough */
3231 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3232 }
3233 else if (repsize!=0) {
3234 /* more than one character */
3235 int requiredsize = *outsize + repsize - 1;
3236 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3237 return -1;
3238 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3239 *outp += repsize;
3240 }
3241 }
3242 else
3243 return -1;
3244 return 0;
3245}
3246
3247PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 int size,
3249 PyObject *mapping,
3250 const char *errors)
3251{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003252 /* output object */
3253 PyObject *res = NULL;
3254 /* pointers to the beginning and end+1 of input */
3255 const Py_UNICODE *startp = p;
3256 const Py_UNICODE *endp = p + size;
3257 /* pointer into the output */
3258 Py_UNICODE *str;
3259 /* current output position */
3260 int respos = 0;
3261 int ressize;
3262 char *reason = "character maps to <undefined>";
3263 PyObject *errorHandler = NULL;
3264 PyObject *exc = NULL;
3265 /* the following variable is used for caching string comparisons
3266 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3267 * 3=ignore, 4=xmlcharrefreplace */
3268 int known_errorHandler = -1;
3269
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 if (mapping == NULL) {
3271 PyErr_BadArgument();
3272 return NULL;
3273 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003274
3275 /* allocate enough for a simple 1:1 translation without
3276 replacements, if we need more, we'll resize */
3277 res = PyUnicode_FromUnicode(NULL, size);
3278 if (res == NULL)
3279 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 return res;
3282 str = PyUnicode_AS_UNICODE(res);
3283 ressize = size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003285 while (p<endp) {
3286 /* try to encode it */
3287 PyObject *x = NULL;
3288 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3289 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290 goto onError;
3291 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003292 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293 if (x!=Py_None) /* it worked => adjust input pointer */
3294 ++p;
3295 else { /* untranslatable character */
3296 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3297 int repsize;
3298 int newpos;
3299 Py_UNICODE *uni2;
3300 /* startpos for collecting untranslatable chars */
3301 const Py_UNICODE *collstart = p;
3302 const Py_UNICODE *collend = p+1;
3303 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003305 /* find all untranslatable characters */
3306 while (collend < endp) {
3307 if (charmaptranslate_lookup(*collend, mapping, &x))
3308 goto onError;
3309 Py_XDECREF(x);
3310 if (x!=Py_None)
3311 break;
3312 ++collend;
3313 }
3314 /* cache callback name lookup
3315 * (if not done yet, i.e. it's the first error) */
3316 if (known_errorHandler==-1) {
3317 if ((errors==NULL) || (!strcmp(errors, "strict")))
3318 known_errorHandler = 1;
3319 else if (!strcmp(errors, "replace"))
3320 known_errorHandler = 2;
3321 else if (!strcmp(errors, "ignore"))
3322 known_errorHandler = 3;
3323 else if (!strcmp(errors, "xmlcharrefreplace"))
3324 known_errorHandler = 4;
3325 else
3326 known_errorHandler = 0;
3327 }
3328 switch (known_errorHandler) {
3329 case 1: /* strict */
3330 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3331 goto onError;
3332 case 2: /* replace */
3333 /* No need to check for space, this is a 1:1 replacement */
3334 for (coll = collstart; coll<collend; ++coll)
3335 *str++ = '?';
3336 /* fall through */
3337 case 3: /* ignore */
3338 p = collend;
3339 break;
3340 case 4: /* xmlcharrefreplace */
3341 /* generate replacement (temporarily (mis)uses p) */
3342 for (p = collstart; p < collend; ++p) {
3343 char buffer[2+29+1+1];
3344 char *cp;
3345 sprintf(buffer, "&#%d;", (int)*p);
3346 if (charmaptranslate_makespace(&res, &str, &ressize,
3347 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3348 goto onError;
3349 for (cp = buffer; *cp; ++cp)
3350 *str++ = *cp;
3351 }
3352 p = collend;
3353 break;
3354 default:
3355 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3356 reason, startp, size, &exc,
3357 collstart-startp, collend-startp, &newpos);
3358 if (repunicode == NULL)
3359 goto onError;
3360 /* generate replacement */
3361 repsize = PyUnicode_GET_SIZE(repunicode);
3362 if (charmaptranslate_makespace(&res, &str, &ressize,
3363 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3364 Py_DECREF(repunicode);
3365 goto onError;
3366 }
3367 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3368 *str++ = *uni2;
3369 p = startp + newpos;
3370 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 }
3372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 /* Resize if we allocated to much */
3375 respos = str-PyUnicode_AS_UNICODE(res);
3376 if (respos<ressize) {
3377 if (_PyUnicode_Resize(&res, respos))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003378 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003379 }
3380 Py_XDECREF(exc);
3381 Py_XDECREF(errorHandler);
3382 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 onError:
3385 Py_XDECREF(res);
3386 Py_XDECREF(exc);
3387 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388 return NULL;
3389}
3390
3391PyObject *PyUnicode_Translate(PyObject *str,
3392 PyObject *mapping,
3393 const char *errors)
3394{
3395 PyObject *result;
3396
3397 str = PyUnicode_FromObject(str);
3398 if (str == NULL)
3399 goto onError;
3400 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3401 PyUnicode_GET_SIZE(str),
3402 mapping,
3403 errors);
3404 Py_DECREF(str);
3405 return result;
3406
3407 onError:
3408 Py_XDECREF(str);
3409 return NULL;
3410}
3411
Guido van Rossum9e896b32000-04-05 20:11:21 +00003412/* --- Decimal Encoder ---------------------------------------------------- */
3413
3414int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3415 int length,
3416 char *output,
3417 const char *errors)
3418{
3419 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003420 PyObject *errorHandler = NULL;
3421 PyObject *exc = NULL;
3422 const char *encoding = "decimal";
3423 const char *reason = "invalid decimal Unicode string";
3424 /* the following variable is used for caching string comparisons
3425 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3426 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003427
3428 if (output == NULL) {
3429 PyErr_BadArgument();
3430 return -1;
3431 }
3432
3433 p = s;
3434 end = s + length;
3435 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003437 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003438 PyObject *repunicode;
3439 int repsize;
3440 int newpos;
3441 Py_UNICODE *uni2;
3442 Py_UNICODE *collstart;
3443 Py_UNICODE *collend;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003444
3445 if (Py_UNICODE_ISSPACE(ch)) {
3446 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003447 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003448 continue;
3449 }
3450 decimal = Py_UNICODE_TODECIMAL(ch);
3451 if (decimal >= 0) {
3452 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003454 continue;
3455 }
Guido van Rossumba477042000-04-06 18:18:10 +00003456 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003457 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003458 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003459 continue;
3460 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461 /* All other characters are considered unencodable */
3462 collstart = p;
3463 collend = p+1;
3464 while (collend < end) {
3465 if ((0 < *collend && *collend < 256) ||
3466 !Py_UNICODE_ISSPACE(*collend) ||
3467 Py_UNICODE_TODECIMAL(*collend))
3468 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003469 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470 /* cache callback name lookup
3471 * (if not done yet, i.e. it's the first error) */
3472 if (known_errorHandler==-1) {
3473 if ((errors==NULL) || (!strcmp(errors, "strict")))
3474 known_errorHandler = 1;
3475 else if (!strcmp(errors, "replace"))
3476 known_errorHandler = 2;
3477 else if (!strcmp(errors, "ignore"))
3478 known_errorHandler = 3;
3479 else if (!strcmp(errors, "xmlcharrefreplace"))
3480 known_errorHandler = 4;
3481 else
3482 known_errorHandler = 0;
3483 }
3484 switch (known_errorHandler) {
3485 case 1: /* strict */
3486 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3487 goto onError;
3488 case 2: /* replace */
3489 for (p = collstart; p < collend; ++p)
3490 *output++ = '?';
3491 /* fall through */
3492 case 3: /* ignore */
3493 p = collend;
3494 break;
3495 case 4: /* xmlcharrefreplace */
3496 /* generate replacement (temporarily (mis)uses p) */
3497 for (p = collstart; p < collend; ++p)
3498 output += sprintf(output, "&#%d;", (int)*p);
3499 p = collend;
3500 break;
3501 default:
3502 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3503 encoding, reason, s, length, &exc,
3504 collstart-s, collend-s, &newpos);
3505 if (repunicode == NULL)
3506 goto onError;
3507 /* generate replacement */
3508 repsize = PyUnicode_GET_SIZE(repunicode);
3509 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3510 Py_UNICODE ch = *uni2;
3511 if (Py_UNICODE_ISSPACE(ch))
3512 *output++ = ' ';
3513 else {
3514 decimal = Py_UNICODE_TODECIMAL(ch);
3515 if (decimal >= 0)
3516 *output++ = '0' + decimal;
3517 else if (0 < ch && ch < 256)
3518 *output++ = (char)ch;
3519 else {
3520 Py_DECREF(repunicode);
3521 raise_encode_exception(&exc, encoding,
3522 s, length, collstart-s, collend-s, reason);
3523 goto onError;
3524 }
3525 }
3526 }
3527 p = s + newpos;
3528 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003529 }
3530 }
3531 /* 0-terminate the output string */
3532 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 Py_XDECREF(exc);
3534 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003535 return 0;
3536
3537 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 Py_XDECREF(exc);
3539 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003540 return -1;
3541}
3542
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543/* --- Helpers ------------------------------------------------------------ */
3544
3545static
3546int count(PyUnicodeObject *self,
3547 int start,
3548 int end,
3549 PyUnicodeObject *substring)
3550{
3551 int count = 0;
3552
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003553 if (start < 0)
3554 start += self->length;
3555 if (start < 0)
3556 start = 0;
3557 if (end > self->length)
3558 end = self->length;
3559 if (end < 0)
3560 end += self->length;
3561 if (end < 0)
3562 end = 0;
3563
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003564 if (substring->length == 0)
3565 return (end - start + 1);
3566
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567 end -= substring->length;
3568
3569 while (start <= end)
3570 if (Py_UNICODE_MATCH(self, start, substring)) {
3571 count++;
3572 start += substring->length;
3573 } else
3574 start++;
3575
3576 return count;
3577}
3578
3579int PyUnicode_Count(PyObject *str,
3580 PyObject *substr,
3581 int start,
3582 int end)
3583{
3584 int result;
3585
3586 str = PyUnicode_FromObject(str);
3587 if (str == NULL)
3588 return -1;
3589 substr = PyUnicode_FromObject(substr);
3590 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003591 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 return -1;
3593 }
3594
3595 result = count((PyUnicodeObject *)str,
3596 start, end,
3597 (PyUnicodeObject *)substr);
3598
3599 Py_DECREF(str);
3600 Py_DECREF(substr);
3601 return result;
3602}
3603
3604static
3605int findstring(PyUnicodeObject *self,
3606 PyUnicodeObject *substring,
3607 int start,
3608 int end,
3609 int direction)
3610{
3611 if (start < 0)
3612 start += self->length;
3613 if (start < 0)
3614 start = 0;
3615
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 if (end > self->length)
3617 end = self->length;
3618 if (end < 0)
3619 end += self->length;
3620 if (end < 0)
3621 end = 0;
3622
Guido van Rossum76afbd92002-08-20 17:29:29 +00003623 if (substring->length == 0)
3624 return (direction > 0) ? start : end;
3625
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 end -= substring->length;
3627
3628 if (direction < 0) {
3629 for (; end >= start; end--)
3630 if (Py_UNICODE_MATCH(self, end, substring))
3631 return end;
3632 } else {
3633 for (; start <= end; start++)
3634 if (Py_UNICODE_MATCH(self, start, substring))
3635 return start;
3636 }
3637
3638 return -1;
3639}
3640
3641int PyUnicode_Find(PyObject *str,
3642 PyObject *substr,
3643 int start,
3644 int end,
3645 int direction)
3646{
3647 int result;
3648
3649 str = PyUnicode_FromObject(str);
3650 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003651 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 substr = PyUnicode_FromObject(substr);
3653 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003654 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003655 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 }
3657
3658 result = findstring((PyUnicodeObject *)str,
3659 (PyUnicodeObject *)substr,
3660 start, end, direction);
3661 Py_DECREF(str);
3662 Py_DECREF(substr);
3663 return result;
3664}
3665
3666static
3667int tailmatch(PyUnicodeObject *self,
3668 PyUnicodeObject *substring,
3669 int start,
3670 int end,
3671 int direction)
3672{
3673 if (start < 0)
3674 start += self->length;
3675 if (start < 0)
3676 start = 0;
3677
3678 if (substring->length == 0)
3679 return 1;
3680
3681 if (end > self->length)
3682 end = self->length;
3683 if (end < 0)
3684 end += self->length;
3685 if (end < 0)
3686 end = 0;
3687
3688 end -= substring->length;
3689 if (end < start)
3690 return 0;
3691
3692 if (direction > 0) {
3693 if (Py_UNICODE_MATCH(self, end, substring))
3694 return 1;
3695 } else {
3696 if (Py_UNICODE_MATCH(self, start, substring))
3697 return 1;
3698 }
3699
3700 return 0;
3701}
3702
3703int PyUnicode_Tailmatch(PyObject *str,
3704 PyObject *substr,
3705 int start,
3706 int end,
3707 int direction)
3708{
3709 int result;
3710
3711 str = PyUnicode_FromObject(str);
3712 if (str == NULL)
3713 return -1;
3714 substr = PyUnicode_FromObject(substr);
3715 if (substr == NULL) {
3716 Py_DECREF(substr);
3717 return -1;
3718 }
3719
3720 result = tailmatch((PyUnicodeObject *)str,
3721 (PyUnicodeObject *)substr,
3722 start, end, direction);
3723 Py_DECREF(str);
3724 Py_DECREF(substr);
3725 return result;
3726}
3727
3728static
3729const Py_UNICODE *findchar(const Py_UNICODE *s,
3730 int size,
3731 Py_UNICODE ch)
3732{
3733 /* like wcschr, but doesn't stop at NULL characters */
3734
3735 while (size-- > 0) {
3736 if (*s == ch)
3737 return s;
3738 s++;
3739 }
3740
3741 return NULL;
3742}
3743
3744/* Apply fixfct filter to the Unicode object self and return a
3745 reference to the modified object */
3746
3747static
3748PyObject *fixup(PyUnicodeObject *self,
3749 int (*fixfct)(PyUnicodeObject *s))
3750{
3751
3752 PyUnicodeObject *u;
3753
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003754 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 if (u == NULL)
3756 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003757
3758 Py_UNICODE_COPY(u->str, self->str, self->length);
3759
Tim Peters7a29bd52001-09-12 03:03:31 +00003760 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 /* fixfct should return TRUE if it modified the buffer. If
3762 FALSE, return a reference to the original buffer instead
3763 (to save space, not time) */
3764 Py_INCREF(self);
3765 Py_DECREF(u);
3766 return (PyObject*) self;
3767 }
3768 return (PyObject*) u;
3769}
3770
3771static
3772int fixupper(PyUnicodeObject *self)
3773{
3774 int len = self->length;
3775 Py_UNICODE *s = self->str;
3776 int status = 0;
3777
3778 while (len-- > 0) {
3779 register Py_UNICODE ch;
3780
3781 ch = Py_UNICODE_TOUPPER(*s);
3782 if (ch != *s) {
3783 status = 1;
3784 *s = ch;
3785 }
3786 s++;
3787 }
3788
3789 return status;
3790}
3791
3792static
3793int fixlower(PyUnicodeObject *self)
3794{
3795 int len = self->length;
3796 Py_UNICODE *s = self->str;
3797 int status = 0;
3798
3799 while (len-- > 0) {
3800 register Py_UNICODE ch;
3801
3802 ch = Py_UNICODE_TOLOWER(*s);
3803 if (ch != *s) {
3804 status = 1;
3805 *s = ch;
3806 }
3807 s++;
3808 }
3809
3810 return status;
3811}
3812
3813static
3814int fixswapcase(PyUnicodeObject *self)
3815{
3816 int len = self->length;
3817 Py_UNICODE *s = self->str;
3818 int status = 0;
3819
3820 while (len-- > 0) {
3821 if (Py_UNICODE_ISUPPER(*s)) {
3822 *s = Py_UNICODE_TOLOWER(*s);
3823 status = 1;
3824 } else if (Py_UNICODE_ISLOWER(*s)) {
3825 *s = Py_UNICODE_TOUPPER(*s);
3826 status = 1;
3827 }
3828 s++;
3829 }
3830
3831 return status;
3832}
3833
3834static
3835int fixcapitalize(PyUnicodeObject *self)
3836{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003837 int len = self->length;
3838 Py_UNICODE *s = self->str;
3839 int status = 0;
3840
3841 if (len == 0)
3842 return 0;
3843 if (Py_UNICODE_ISLOWER(*s)) {
3844 *s = Py_UNICODE_TOUPPER(*s);
3845 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003847 s++;
3848 while (--len > 0) {
3849 if (Py_UNICODE_ISUPPER(*s)) {
3850 *s = Py_UNICODE_TOLOWER(*s);
3851 status = 1;
3852 }
3853 s++;
3854 }
3855 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856}
3857
3858static
3859int fixtitle(PyUnicodeObject *self)
3860{
3861 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3862 register Py_UNICODE *e;
3863 int previous_is_cased;
3864
3865 /* Shortcut for single character strings */
3866 if (PyUnicode_GET_SIZE(self) == 1) {
3867 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3868 if (*p != ch) {
3869 *p = ch;
3870 return 1;
3871 }
3872 else
3873 return 0;
3874 }
3875
3876 e = p + PyUnicode_GET_SIZE(self);
3877 previous_is_cased = 0;
3878 for (; p < e; p++) {
3879 register const Py_UNICODE ch = *p;
3880
3881 if (previous_is_cased)
3882 *p = Py_UNICODE_TOLOWER(ch);
3883 else
3884 *p = Py_UNICODE_TOTITLE(ch);
3885
3886 if (Py_UNICODE_ISLOWER(ch) ||
3887 Py_UNICODE_ISUPPER(ch) ||
3888 Py_UNICODE_ISTITLE(ch))
3889 previous_is_cased = 1;
3890 else
3891 previous_is_cased = 0;
3892 }
3893 return 1;
3894}
3895
3896PyObject *PyUnicode_Join(PyObject *separator,
3897 PyObject *seq)
3898{
3899 Py_UNICODE *sep;
3900 int seplen;
3901 PyUnicodeObject *res = NULL;
3902 int reslen = 0;
3903 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904 int sz = 100;
3905 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003906 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907
Tim Peters2cfe3682001-05-05 05:36:48 +00003908 it = PyObject_GetIter(seq);
3909 if (it == NULL)
3910 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911
3912 if (separator == NULL) {
3913 Py_UNICODE blank = ' ';
3914 sep = &blank;
3915 seplen = 1;
3916 }
3917 else {
3918 separator = PyUnicode_FromObject(separator);
3919 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003920 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 sep = PyUnicode_AS_UNICODE(separator);
3922 seplen = PyUnicode_GET_SIZE(separator);
3923 }
3924
3925 res = _PyUnicode_New(sz);
3926 if (res == NULL)
3927 goto onError;
3928 p = PyUnicode_AS_UNICODE(res);
3929 reslen = 0;
3930
Tim Peters2cfe3682001-05-05 05:36:48 +00003931 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003933 PyObject *item = PyIter_Next(it);
3934 if (item == NULL) {
3935 if (PyErr_Occurred())
3936 goto onError;
3937 break;
3938 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003939 if (!PyUnicode_Check(item)) {
3940 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003941 if (!PyString_Check(item)) {
3942 PyErr_Format(PyExc_TypeError,
3943 "sequence item %i: expected string or Unicode,"
3944 " %.80s found",
3945 i, item->ob_type->tp_name);
3946 Py_DECREF(item);
3947 goto onError;
3948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949 v = PyUnicode_FromObject(item);
3950 Py_DECREF(item);
3951 item = v;
3952 if (item == NULL)
3953 goto onError;
3954 }
3955 itemlen = PyUnicode_GET_SIZE(item);
3956 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003957 if (_PyUnicode_Resize(&res, sz*2)) {
3958 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 sz *= 2;
3962 p = PyUnicode_AS_UNICODE(res) + reslen;
3963 }
3964 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003965 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 p += seplen;
3967 reslen += seplen;
3968 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003969 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 p += itemlen;
3971 reslen += itemlen;
3972 Py_DECREF(item);
3973 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003974 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 goto onError;
3976
3977 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003978 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979 return (PyObject *)res;
3980
3981 onError:
3982 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003983 Py_XDECREF(res);
3984 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 return NULL;
3986}
3987
3988static
3989PyUnicodeObject *pad(PyUnicodeObject *self,
3990 int left,
3991 int right,
3992 Py_UNICODE fill)
3993{
3994 PyUnicodeObject *u;
3995
3996 if (left < 0)
3997 left = 0;
3998 if (right < 0)
3999 right = 0;
4000
Tim Peters7a29bd52001-09-12 03:03:31 +00004001 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002 Py_INCREF(self);
4003 return self;
4004 }
4005
4006 u = _PyUnicode_New(left + self->length + right);
4007 if (u) {
4008 if (left)
4009 Py_UNICODE_FILL(u->str, fill, left);
4010 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4011 if (right)
4012 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4013 }
4014
4015 return u;
4016}
4017
/* Append the characters data[left] .. data[right-1] to list as a new
   Unicode object.  The expanding function must provide the variables
   `str` (PyObject *) and `list` and a label `onError`, which is jumped
   to on failure.  Wrapped in do { } while (0) so that the macro behaves
   like a single statement; use it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4028
4029static
4030PyObject *split_whitespace(PyUnicodeObject *self,
4031 PyObject *list,
4032 int maxcount)
4033{
4034 register int i;
4035 register int j;
4036 int len = self->length;
4037 PyObject *str;
4038
4039 for (i = j = 0; i < len; ) {
4040 /* find a token */
4041 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4042 i++;
4043 j = i;
4044 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4045 i++;
4046 if (j < i) {
4047 if (maxcount-- <= 0)
4048 break;
4049 SPLIT_APPEND(self->str, j, i);
4050 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4051 i++;
4052 j = i;
4053 }
4054 }
4055 if (j < len) {
4056 SPLIT_APPEND(self->str, j, len);
4057 }
4058 return list;
4059
4060 onError:
4061 Py_DECREF(list);
4062 return NULL;
4063}
4064
4065PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004066 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067{
4068 register int i;
4069 register int j;
4070 int len;
4071 PyObject *list;
4072 PyObject *str;
4073 Py_UNICODE *data;
4074
4075 string = PyUnicode_FromObject(string);
4076 if (string == NULL)
4077 return NULL;
4078 data = PyUnicode_AS_UNICODE(string);
4079 len = PyUnicode_GET_SIZE(string);
4080
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081 list = PyList_New(0);
4082 if (!list)
4083 goto onError;
4084
4085 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004086 int eol;
4087
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088 /* Find a line and append it */
4089 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4090 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091
4092 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004093 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 if (i < len) {
4095 if (data[i] == '\r' && i + 1 < len &&
4096 data[i+1] == '\n')
4097 i += 2;
4098 else
4099 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004100 if (keepends)
4101 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102 }
Guido van Rossum86662912000-04-11 15:38:46 +00004103 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 j = i;
4105 }
4106 if (j < len) {
4107 SPLIT_APPEND(data, j, len);
4108 }
4109
4110 Py_DECREF(string);
4111 return list;
4112
4113 onError:
4114 Py_DECREF(list);
4115 Py_DECREF(string);
4116 return NULL;
4117}
4118
4119static
4120PyObject *split_char(PyUnicodeObject *self,
4121 PyObject *list,
4122 Py_UNICODE ch,
4123 int maxcount)
4124{
4125 register int i;
4126 register int j;
4127 int len = self->length;
4128 PyObject *str;
4129
4130 for (i = j = 0; i < len; ) {
4131 if (self->str[i] == ch) {
4132 if (maxcount-- <= 0)
4133 break;
4134 SPLIT_APPEND(self->str, j, i);
4135 i = j = i + 1;
4136 } else
4137 i++;
4138 }
4139 if (j <= len) {
4140 SPLIT_APPEND(self->str, j, len);
4141 }
4142 return list;
4143
4144 onError:
4145 Py_DECREF(list);
4146 return NULL;
4147}
4148
4149static
4150PyObject *split_substring(PyUnicodeObject *self,
4151 PyObject *list,
4152 PyUnicodeObject *substring,
4153 int maxcount)
4154{
4155 register int i;
4156 register int j;
4157 int len = self->length;
4158 int sublen = substring->length;
4159 PyObject *str;
4160
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004161 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162 if (Py_UNICODE_MATCH(self, i, substring)) {
4163 if (maxcount-- <= 0)
4164 break;
4165 SPLIT_APPEND(self->str, j, i);
4166 i = j = i + sublen;
4167 } else
4168 i++;
4169 }
4170 if (j <= len) {
4171 SPLIT_APPEND(self->str, j, len);
4172 }
4173 return list;
4174
4175 onError:
4176 Py_DECREF(list);
4177 return NULL;
4178}
4179
4180#undef SPLIT_APPEND
4181
4182static
4183PyObject *split(PyUnicodeObject *self,
4184 PyUnicodeObject *substring,
4185 int maxcount)
4186{
4187 PyObject *list;
4188
4189 if (maxcount < 0)
4190 maxcount = INT_MAX;
4191
4192 list = PyList_New(0);
4193 if (!list)
4194 return NULL;
4195
4196 if (substring == NULL)
4197 return split_whitespace(self,list,maxcount);
4198
4199 else if (substring->length == 1)
4200 return split_char(self,list,substring->str[0],maxcount);
4201
4202 else if (substring->length == 0) {
4203 Py_DECREF(list);
4204 PyErr_SetString(PyExc_ValueError, "empty separator");
4205 return NULL;
4206 }
4207 else
4208 return split_substring(self,list,substring,maxcount);
4209}
4210
4211static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212PyObject *replace(PyUnicodeObject *self,
4213 PyUnicodeObject *str1,
4214 PyUnicodeObject *str2,
4215 int maxcount)
4216{
4217 PyUnicodeObject *u;
4218
4219 if (maxcount < 0)
4220 maxcount = INT_MAX;
4221
4222 if (str1->length == 1 && str2->length == 1) {
4223 int i;
4224
4225 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004226 if (!findchar(self->str, self->length, str1->str[0]) &&
4227 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 /* nothing to replace, return original string */
4229 Py_INCREF(self);
4230 u = self;
4231 } else {
4232 Py_UNICODE u1 = str1->str[0];
4233 Py_UNICODE u2 = str2->str[0];
4234
4235 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004236 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237 self->length
4238 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004239 if (u != NULL) {
4240 Py_UNICODE_COPY(u->str, self->str,
4241 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004242 for (i = 0; i < u->length; i++)
4243 if (u->str[i] == u1) {
4244 if (--maxcount < 0)
4245 break;
4246 u->str[i] = u2;
4247 }
4248 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004250
4251 } else {
4252 int n, i;
4253 Py_UNICODE *p;
4254
4255 /* replace strings */
4256 n = count(self, 0, self->length, str1);
4257 if (n > maxcount)
4258 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004259 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004261 if (PyUnicode_CheckExact(self)) {
4262 Py_INCREF(self);
4263 u = self;
4264 }
4265 else {
4266 u = (PyUnicodeObject *)
4267 PyUnicode_FromUnicode(self->str, self->length);
4268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004269 } else {
4270 u = _PyUnicode_New(
4271 self->length + n * (str2->length - str1->length));
4272 if (u) {
4273 i = 0;
4274 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004275 if (str1->length > 0) {
4276 while (i <= self->length - str1->length)
4277 if (Py_UNICODE_MATCH(self, i, str1)) {
4278 /* replace string segment */
4279 Py_UNICODE_COPY(p, str2->str, str2->length);
4280 p += str2->length;
4281 i += str1->length;
4282 if (--n <= 0) {
4283 /* copy remaining part */
4284 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4285 break;
4286 }
4287 } else
4288 *p++ = self->str[i++];
4289 } else {
4290 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 Py_UNICODE_COPY(p, str2->str, str2->length);
4292 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004293 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004296 }
4297 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4298 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 }
4300 }
4301 }
4302
4303 return (PyObject *) u;
4304}
4305
4306/* --- Unicode Object Methods --------------------------------------------- */
4307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004308PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309"S.title() -> unicode\n\
4310\n\
4311Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004312characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313
4314static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004315unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317 return fixup(self, fixtitle);
4318}
4319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004320PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321"S.capitalize() -> unicode\n\
4322\n\
4323Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004324have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325
4326static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004327unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329 return fixup(self, fixcapitalize);
4330}
4331
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split at whitespace,
   capitalize each word and join the words with a single blank. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
4369
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004370PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371"S.center(width) -> unicode\n\
4372\n\
4373Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004374using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375
4376static PyObject *
4377unicode_center(PyUnicodeObject *self, PyObject *args)
4378{
4379 int marg, left;
4380 int width;
4381
4382 if (!PyArg_ParseTuple(args, "i:center", &width))
4383 return NULL;
4384
Tim Peters7a29bd52001-09-12 03:03:31 +00004385 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386 Py_INCREF(self);
4387 return (PyObject*) self;
4388 }
4389
4390 marg = width - self->length;
4391 left = marg / 2 + (marg & width & 1);
4392
4393 return (PyObject*) pad(self, left, marg - left, ' ');
4394}
4395
Marc-André Lemburge5034372000-08-08 08:04:29 +00004396#if 0
4397
4398/* This code should go into some future Unicode collation support
4399 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004400 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004401
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004402/* speedy UTF-16 code point order comparison */
4403/* gleaned from: */
4404/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4405
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004406static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004407{
4408 0, 0, 0, 0, 0, 0, 0, 0,
4409 0, 0, 0, 0, 0, 0, 0, 0,
4410 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004411 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004412};
4413
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414static int
4415unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4416{
4417 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004418
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 Py_UNICODE *s1 = str1->str;
4420 Py_UNICODE *s2 = str2->str;
4421
4422 len1 = str1->length;
4423 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004424
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004426 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004427
4428 c1 = *s1++;
4429 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004430
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004431 if (c1 > (1<<11) * 26)
4432 c1 += utf16Fixup[c1>>11];
4433 if (c2 > (1<<11) * 26)
4434 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004435 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004436
4437 if (c1 != c2)
4438 return (c1 < c2) ? -1 : 1;
4439
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004440 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 }
4442
4443 return (len1 < len2) ? -1 : (len1 != len2);
4444}
4445
Marc-André Lemburge5034372000-08-08 08:04:29 +00004446#else
4447
4448static int
4449unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4450{
4451 register int len1, len2;
4452
4453 Py_UNICODE *s1 = str1->str;
4454 Py_UNICODE *s2 = str2->str;
4455
4456 len1 = str1->length;
4457 len2 = str2->length;
4458
4459 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00004460 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004461
Fredrik Lundh45714e92001-06-26 16:39:36 +00004462 c1 = *s1++;
4463 c2 = *s2++;
4464
4465 if (c1 != c2)
4466 return (c1 < c2) ? -1 : 1;
4467
Marc-André Lemburge5034372000-08-08 08:04:29 +00004468 len1--; len2--;
4469 }
4470
4471 return (len1 < len2) ? -1 : (len1 != len2);
4472}
4473
4474#endif
4475
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476int PyUnicode_Compare(PyObject *left,
4477 PyObject *right)
4478{
4479 PyUnicodeObject *u = NULL, *v = NULL;
4480 int result;
4481
4482 /* Coerce the two arguments */
4483 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4484 if (u == NULL)
4485 goto onError;
4486 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4487 if (v == NULL)
4488 goto onError;
4489
Thomas Wouters7e474022000-07-16 12:04:32 +00004490 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491 if (v == u) {
4492 Py_DECREF(u);
4493 Py_DECREF(v);
4494 return 0;
4495 }
4496
4497 result = unicode_compare(u, v);
4498
4499 Py_DECREF(u);
4500 Py_DECREF(v);
4501 return result;
4502
4503onError:
4504 Py_XDECREF(u);
4505 Py_XDECREF(v);
4506 return -1;
4507}
4508
Guido van Rossum403d68b2000-03-13 15:55:09 +00004509int PyUnicode_Contains(PyObject *container,
4510 PyObject *element)
4511{
4512 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004513 int result, size;
4514 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004515
4516 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004517 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004518 if (v == NULL) {
4519 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004520 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004521 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004522 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004523 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004524 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004525 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004526
Barry Warsaw817918c2002-08-06 16:58:21 +00004527 size = PyUnicode_GET_SIZE(v);
4528 rhs = PyUnicode_AS_UNICODE(v);
4529 lhs = PyUnicode_AS_UNICODE(u);
4530
Guido van Rossum403d68b2000-03-13 15:55:09 +00004531 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004532 if (size == 1) {
4533 end = lhs + PyUnicode_GET_SIZE(u);
4534 while (lhs < end) {
4535 if (*lhs++ == *rhs) {
4536 result = 1;
4537 break;
4538 }
4539 }
4540 }
4541 else {
4542 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4543 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004544 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004545 result = 1;
4546 break;
4547 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004548 }
4549 }
4550
4551 Py_DECREF(u);
4552 Py_DECREF(v);
4553 return result;
4554
4555onError:
4556 Py_XDECREF(u);
4557 Py_XDECREF(v);
4558 return -1;
4559}
4560
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561/* Concat to string or Unicode object giving a new Unicode object. */
4562
4563PyObject *PyUnicode_Concat(PyObject *left,
4564 PyObject *right)
4565{
4566 PyUnicodeObject *u = NULL, *v = NULL, *w;
4567
4568 /* Coerce the two arguments */
4569 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4570 if (u == NULL)
4571 goto onError;
4572 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4573 if (v == NULL)
4574 goto onError;
4575
4576 /* Shortcuts */
4577 if (v == unicode_empty) {
4578 Py_DECREF(v);
4579 return (PyObject *)u;
4580 }
4581 if (u == unicode_empty) {
4582 Py_DECREF(u);
4583 return (PyObject *)v;
4584 }
4585
4586 /* Concat the two Unicode strings */
4587 w = _PyUnicode_New(u->length + v->length);
4588 if (w == NULL)
4589 goto onError;
4590 Py_UNICODE_COPY(w->str, u->str, u->length);
4591 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4592
4593 Py_DECREF(u);
4594 Py_DECREF(v);
4595 return (PyObject *)w;
4596
4597onError:
4598 Py_XDECREF(u);
4599 Py_XDECREF(v);
4600 return NULL;
4601}
4602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004603PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604"S.count(sub[, start[, end]]) -> int\n\
4605\n\
4606Return the number of occurrences of substring sub in Unicode string\n\
4607S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004608interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609
4610static PyObject *
4611unicode_count(PyUnicodeObject *self, PyObject *args)
4612{
4613 PyUnicodeObject *substring;
4614 int start = 0;
4615 int end = INT_MAX;
4616 PyObject *result;
4617
Guido van Rossumb8872e62000-05-09 14:14:27 +00004618 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4619 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620 return NULL;
4621
4622 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4623 (PyObject *)substring);
4624 if (substring == NULL)
4625 return NULL;
4626
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627 if (start < 0)
4628 start += self->length;
4629 if (start < 0)
4630 start = 0;
4631 if (end > self->length)
4632 end = self->length;
4633 if (end < 0)
4634 end += self->length;
4635 if (end < 0)
4636 end = 0;
4637
4638 result = PyInt_FromLong((long) count(self, start, end, substring));
4639
4640 Py_DECREF(substring);
4641 return result;
4642}
4643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004644PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645"S.encode([encoding[,errors]]) -> string\n\
4646\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004647Return an encoded string version of S. Default encoding is the current\n\
4648default string encoding. errors may be given to set a different error\n\
4649handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4651'xmlcharrefreplace' as well as any other name registered with\n\
4652codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653
4654static PyObject *
4655unicode_encode(PyUnicodeObject *self, PyObject *args)
4656{
4657 char *encoding = NULL;
4658 char *errors = NULL;
4659 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4660 return NULL;
4661 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4662}
4663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004664PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665"S.expandtabs([tabsize]) -> unicode\n\
4666\n\
4667Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004668If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669
4670static PyObject*
4671unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4672{
4673 Py_UNICODE *e;
4674 Py_UNICODE *p;
4675 Py_UNICODE *q;
4676 int i, j;
4677 PyUnicodeObject *u;
4678 int tabsize = 8;
4679
4680 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4681 return NULL;
4682
Thomas Wouters7e474022000-07-16 12:04:32 +00004683 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 i = j = 0;
4685 e = self->str + self->length;
4686 for (p = self->str; p < e; p++)
4687 if (*p == '\t') {
4688 if (tabsize > 0)
4689 j += tabsize - (j % tabsize);
4690 }
4691 else {
4692 j++;
4693 if (*p == '\n' || *p == '\r') {
4694 i += j;
4695 j = 0;
4696 }
4697 }
4698
4699 /* Second pass: create output string and fill it */
4700 u = _PyUnicode_New(i + j);
4701 if (!u)
4702 return NULL;
4703
4704 j = 0;
4705 q = u->str;
4706
4707 for (p = self->str; p < e; p++)
4708 if (*p == '\t') {
4709 if (tabsize > 0) {
4710 i = tabsize - (j % tabsize);
4711 j += i;
4712 while (i--)
4713 *q++ = ' ';
4714 }
4715 }
4716 else {
4717 j++;
4718 *q++ = *p;
4719 if (*p == '\n' || *p == '\r')
4720 j = 0;
4721 }
4722
4723 return (PyObject*) u;
4724}
4725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004726PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727"S.find(sub [,start [,end]]) -> int\n\
4728\n\
4729Return the lowest index in S where substring sub is found,\n\
4730such that sub is contained within s[start,end]. Optional\n\
4731arguments start and end are interpreted as in slice notation.\n\
4732\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004733Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734
4735static PyObject *
4736unicode_find(PyUnicodeObject *self, PyObject *args)
4737{
4738 PyUnicodeObject *substring;
4739 int start = 0;
4740 int end = INT_MAX;
4741 PyObject *result;
4742
Guido van Rossumb8872e62000-05-09 14:14:27 +00004743 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4744 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 return NULL;
4746 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4747 (PyObject *)substring);
4748 if (substring == NULL)
4749 return NULL;
4750
4751 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4752
4753 Py_DECREF(substring);
4754 return result;
4755}
4756
4757static PyObject *
4758unicode_getitem(PyUnicodeObject *self, int index)
4759{
4760 if (index < 0 || index >= self->length) {
4761 PyErr_SetString(PyExc_IndexError, "string index out of range");
4762 return NULL;
4763 }
4764
4765 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4766}
4767
4768static long
4769unicode_hash(PyUnicodeObject *self)
4770{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004771 /* Since Unicode objects compare equal to their ASCII string
4772 counterparts, they should use the individual character values
4773 as basis for their hash value. This is needed to assure that
4774 strings and Unicode objects behave in the same way as
4775 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776
Fredrik Lundhdde61642000-07-10 18:27:47 +00004777 register int len;
4778 register Py_UNICODE *p;
4779 register long x;
4780
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781 if (self->hash != -1)
4782 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004783 len = PyUnicode_GET_SIZE(self);
4784 p = PyUnicode_AS_UNICODE(self);
4785 x = *p << 7;
4786 while (--len >= 0)
4787 x = (1000003*x) ^ *p++;
4788 x ^= PyUnicode_GET_SIZE(self);
4789 if (x == -1)
4790 x = -2;
4791 self->hash = x;
4792 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793}
4794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004795PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796"S.index(sub [,start [,end]]) -> int\n\
4797\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004798Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799
4800static PyObject *
4801unicode_index(PyUnicodeObject *self, PyObject *args)
4802{
4803 int result;
4804 PyUnicodeObject *substring;
4805 int start = 0;
4806 int end = INT_MAX;
4807
Guido van Rossumb8872e62000-05-09 14:14:27 +00004808 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4809 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 return NULL;
4811
4812 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4813 (PyObject *)substring);
4814 if (substring == NULL)
4815 return NULL;
4816
4817 result = findstring(self, substring, start, end, 1);
4818
4819 Py_DECREF(substring);
4820 if (result < 0) {
4821 PyErr_SetString(PyExc_ValueError, "substring not found");
4822 return NULL;
4823 }
4824 return PyInt_FromLong(result);
4825}
4826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004827PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004828"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004830Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004831at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832
4833static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004834unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835{
4836 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4837 register const Py_UNICODE *e;
4838 int cased;
4839
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 /* Shortcut for single character strings */
4841 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004842 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004844 /* Special case for empty strings */
4845 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004846 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004847
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 e = p + PyUnicode_GET_SIZE(self);
4849 cased = 0;
4850 for (; p < e; p++) {
4851 register const Py_UNICODE ch = *p;
4852
4853 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004854 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 else if (!cased && Py_UNICODE_ISLOWER(ch))
4856 cased = 1;
4857 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004858 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859}
4860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004861PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004862"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004864Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004865at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866
4867static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004868unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869{
4870 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4871 register const Py_UNICODE *e;
4872 int cased;
4873
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 /* Shortcut for single character strings */
4875 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004876 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004878 /* Special case for empty strings */
4879 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004880 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004881
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 e = p + PyUnicode_GET_SIZE(self);
4883 cased = 0;
4884 for (; p < e; p++) {
4885 register const Py_UNICODE ch = *p;
4886
4887 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004888 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 else if (!cased && Py_UNICODE_ISUPPER(ch))
4890 cased = 1;
4891 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004892 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893}
4894
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004895PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004896"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004898Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4899characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004900only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901
4902static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004903unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904{
4905 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4906 register const Py_UNICODE *e;
4907 int cased, previous_is_cased;
4908
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909 /* Shortcut for single character strings */
4910 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004911 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4912 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004914 /* Special case for empty strings */
4915 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004916 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004917
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 e = p + PyUnicode_GET_SIZE(self);
4919 cased = 0;
4920 previous_is_cased = 0;
4921 for (; p < e; p++) {
4922 register const Py_UNICODE ch = *p;
4923
4924 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4925 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004926 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927 previous_is_cased = 1;
4928 cased = 1;
4929 }
4930 else if (Py_UNICODE_ISLOWER(ch)) {
4931 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004932 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933 previous_is_cased = 1;
4934 cased = 1;
4935 }
4936 else
4937 previous_is_cased = 0;
4938 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004939 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940}
4941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004942PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004943"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004945Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004946False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947
4948static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004949unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950{
4951 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4952 register const Py_UNICODE *e;
4953
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954 /* Shortcut for single character strings */
4955 if (PyUnicode_GET_SIZE(self) == 1 &&
4956 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004957 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004959 /* Special case for empty strings */
4960 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004961 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004962
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963 e = p + PyUnicode_GET_SIZE(self);
4964 for (; p < e; p++) {
4965 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004966 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004968 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969}
4970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004971PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004972"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004973\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004974Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004975and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004976
4977static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004978unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004979{
4980 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4981 register const Py_UNICODE *e;
4982
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004983 /* Shortcut for single character strings */
4984 if (PyUnicode_GET_SIZE(self) == 1 &&
4985 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004986 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004987
4988 /* Special case for empty strings */
4989 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004990 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004991
4992 e = p + PyUnicode_GET_SIZE(self);
4993 for (; p < e; p++) {
4994 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004995 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004996 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004997 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004998}
4999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005000PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005001"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005002\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005003Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005004and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005005
5006static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005007unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005008{
5009 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5010 register const Py_UNICODE *e;
5011
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005012 /* Shortcut for single character strings */
5013 if (PyUnicode_GET_SIZE(self) == 1 &&
5014 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005015 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005016
5017 /* Special case for empty strings */
5018 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005019 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005020
5021 e = p + PyUnicode_GET_SIZE(self);
5022 for (; p < e; p++) {
5023 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005024 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005025 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005026 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005027}
5028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005029PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005030"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005032Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005033False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034
5035static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005036unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037{
5038 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5039 register const Py_UNICODE *e;
5040
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041 /* Shortcut for single character strings */
5042 if (PyUnicode_GET_SIZE(self) == 1 &&
5043 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005044 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005046 /* Special case for empty strings */
5047 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005048 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005049
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 e = p + PyUnicode_GET_SIZE(self);
5051 for (; p < e; p++) {
5052 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005053 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005055 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056}
5057
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005058PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005059"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005061Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005062False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063
5064static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005065unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066{
5067 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5068 register const Py_UNICODE *e;
5069
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070 /* Shortcut for single character strings */
5071 if (PyUnicode_GET_SIZE(self) == 1 &&
5072 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005073 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005075 /* Special case for empty strings */
5076 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005077 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005078
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079 e = p + PyUnicode_GET_SIZE(self);
5080 for (; p < e; p++) {
5081 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005082 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005084 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085}
5086
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005087PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005088"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005090Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005091False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092
5093static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005094unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095{
5096 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5097 register const Py_UNICODE *e;
5098
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099 /* Shortcut for single character strings */
5100 if (PyUnicode_GET_SIZE(self) == 1 &&
5101 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005102 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005104 /* Special case for empty strings */
5105 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005106 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005107
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108 e = p + PyUnicode_GET_SIZE(self);
5109 for (; p < e; p++) {
5110 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005111 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005113 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114}
5115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005116PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117"S.join(sequence) -> unicode\n\
5118\n\
5119Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005120sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121
5122static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005123unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005125 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126}
5127
5128static int
5129unicode_length(PyUnicodeObject *self)
5130{
5131 return self->length;
5132}
5133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005134PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135"S.ljust(width) -> unicode\n\
5136\n\
5137Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005138done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139
5140static PyObject *
5141unicode_ljust(PyUnicodeObject *self, PyObject *args)
5142{
5143 int width;
5144 if (!PyArg_ParseTuple(args, "i:ljust", &width))
5145 return NULL;
5146
Tim Peters7a29bd52001-09-12 03:03:31 +00005147 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 Py_INCREF(self);
5149 return (PyObject*) self;
5150 }
5151
5152 return (PyObject*) pad(self, 0, width - self->length, ' ');
5153}
5154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005155PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156"S.lower() -> unicode\n\
5157\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005158Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159
5160static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005161unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163 return fixup(self, fixlower);
5164}
5165
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string without its leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i]+3)
5174
5175static const Py_UNICODE *
5176unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5177{
Tim Peters030a5ce2002-04-22 19:00:10 +00005178 size_t i;
5179 for (i = 0; i < n; ++i)
5180 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005181 return s+i;
5182 return NULL;
5183}
5184
5185/* externally visible for str.strip(unicode) */
5186PyObject *
5187_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5188{
5189 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5190 int len = PyUnicode_GET_SIZE(self);
5191 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5192 int seplen = PyUnicode_GET_SIZE(sepobj);
5193 int i, j;
5194
5195 i = 0;
5196 if (striptype != RIGHTSTRIP) {
5197 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5198 i++;
5199 }
5200 }
5201
5202 j = len;
5203 if (striptype != LEFTSTRIP) {
5204 do {
5205 j--;
5206 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5207 j++;
5208 }
5209
5210 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5211 Py_INCREF(self);
5212 return (PyObject*)self;
5213 }
5214 else
5215 return PyUnicode_FromUnicode(s+i, j-i);
5216}
5217
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218
5219static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005220do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005222 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5223 int len = PyUnicode_GET_SIZE(self), i, j;
5224
5225 i = 0;
5226 if (striptype != RIGHTSTRIP) {
5227 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5228 i++;
5229 }
5230 }
5231
5232 j = len;
5233 if (striptype != LEFTSTRIP) {
5234 do {
5235 j--;
5236 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5237 j++;
5238 }
5239
5240 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5241 Py_INCREF(self);
5242 return (PyObject*)self;
5243 }
5244 else
5245 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246}
5247
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005248
5249static PyObject *
5250do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5251{
5252 PyObject *sep = NULL;
5253
5254 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5255 return NULL;
5256
5257 if (sep != NULL && sep != Py_None) {
5258 if (PyUnicode_Check(sep))
5259 return _PyUnicode_XStrip(self, striptype, sep);
5260 else if (PyString_Check(sep)) {
5261 PyObject *res;
5262 sep = PyUnicode_FromObject(sep);
5263 if (sep==NULL)
5264 return NULL;
5265 res = _PyUnicode_XStrip(self, striptype, sep);
5266 Py_DECREF(sep);
5267 return res;
5268 }
5269 else {
5270 PyErr_Format(PyExc_TypeError,
5271 "%s arg must be None, unicode or str",
5272 STRIPNAME(striptype));
5273 return NULL;
5274 }
5275 }
5276
5277 return do_strip(self, striptype);
5278}
5279
5280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005281PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005282"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005283\n\
5284Return a copy of the string S with leading and trailing\n\
5285whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005286If chars is given and not None, remove characters in chars instead.\n\
5287If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005288
5289static PyObject *
5290unicode_strip(PyUnicodeObject *self, PyObject *args)
5291{
5292 if (PyTuple_GET_SIZE(args) == 0)
5293 return do_strip(self, BOTHSTRIP); /* Common case */
5294 else
5295 return do_argstrip(self, BOTHSTRIP, args);
5296}
5297
5298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005299PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005300"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005301\n\
5302Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005303If chars is given and not None, remove characters in chars instead.\n\
5304If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005305
5306static PyObject *
5307unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5308{
5309 if (PyTuple_GET_SIZE(args) == 0)
5310 return do_strip(self, LEFTSTRIP); /* Common case */
5311 else
5312 return do_argstrip(self, LEFTSTRIP, args);
5313}
5314
5315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005316PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005317"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005318\n\
5319Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005320If chars is given and not None, remove characters in chars instead.\n\
5321If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005322
5323static PyObject *
5324unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5325{
5326 if (PyTuple_GET_SIZE(args) == 0)
5327 return do_strip(self, RIGHTSTRIP); /* Common case */
5328 else
5329 return do_argstrip(self, RIGHTSTRIP, args);
5330}
5331
5332
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333static PyObject*
5334unicode_repeat(PyUnicodeObject *str, int len)
5335{
5336 PyUnicodeObject *u;
5337 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005338 int nchars;
5339 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340
5341 if (len < 0)
5342 len = 0;
5343
Tim Peters7a29bd52001-09-12 03:03:31 +00005344 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 /* no repeat, return original string */
5346 Py_INCREF(str);
5347 return (PyObject*) str;
5348 }
Tim Peters8f422462000-09-09 06:13:41 +00005349
5350 /* ensure # of chars needed doesn't overflow int and # of bytes
5351 * needed doesn't overflow size_t
5352 */
5353 nchars = len * str->length;
5354 if (len && nchars / len != str->length) {
5355 PyErr_SetString(PyExc_OverflowError,
5356 "repeated string is too long");
5357 return NULL;
5358 }
5359 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5360 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5361 PyErr_SetString(PyExc_OverflowError,
5362 "repeated string is too long");
5363 return NULL;
5364 }
5365 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 if (!u)
5367 return NULL;
5368
5369 p = u->str;
5370
5371 while (len-- > 0) {
5372 Py_UNICODE_COPY(p, str->str, str->length);
5373 p += str->length;
5374 }
5375
5376 return (PyObject*) u;
5377}
5378
5379PyObject *PyUnicode_Replace(PyObject *obj,
5380 PyObject *subobj,
5381 PyObject *replobj,
5382 int maxcount)
5383{
5384 PyObject *self;
5385 PyObject *str1;
5386 PyObject *str2;
5387 PyObject *result;
5388
5389 self = PyUnicode_FromObject(obj);
5390 if (self == NULL)
5391 return NULL;
5392 str1 = PyUnicode_FromObject(subobj);
5393 if (str1 == NULL) {
5394 Py_DECREF(self);
5395 return NULL;
5396 }
5397 str2 = PyUnicode_FromObject(replobj);
5398 if (str2 == NULL) {
5399 Py_DECREF(self);
5400 Py_DECREF(str1);
5401 return NULL;
5402 }
5403 result = replace((PyUnicodeObject *)self,
5404 (PyUnicodeObject *)str1,
5405 (PyUnicodeObject *)str2,
5406 maxcount);
5407 Py_DECREF(self);
5408 Py_DECREF(str1);
5409 Py_DECREF(str2);
5410 return result;
5411}
5412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005413PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414"S.replace (old, new[, maxsplit]) -> unicode\n\
5415\n\
5416Return a copy of S with all occurrences of substring\n\
5417old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005418given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419
5420static PyObject*
5421unicode_replace(PyUnicodeObject *self, PyObject *args)
5422{
5423 PyUnicodeObject *str1;
5424 PyUnicodeObject *str2;
5425 int maxcount = -1;
5426 PyObject *result;
5427
5428 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5429 return NULL;
5430 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5431 if (str1 == NULL)
5432 return NULL;
5433 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005434 if (str2 == NULL) {
5435 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438
5439 result = replace(self, str1, str2, maxcount);
5440
5441 Py_DECREF(str1);
5442 Py_DECREF(str2);
5443 return result;
5444}
5445
5446static
5447PyObject *unicode_repr(PyObject *unicode)
5448{
5449 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5450 PyUnicode_GET_SIZE(unicode),
5451 1);
5452}
5453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005454PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455"S.rfind(sub [,start [,end]]) -> int\n\
5456\n\
5457Return the highest index in S where substring sub is found,\n\
5458such that sub is contained within s[start,end]. Optional\n\
5459arguments start and end are interpreted as in slice notation.\n\
5460\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005461Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462
5463static PyObject *
5464unicode_rfind(PyUnicodeObject *self, PyObject *args)
5465{
5466 PyUnicodeObject *substring;
5467 int start = 0;
5468 int end = INT_MAX;
5469 PyObject *result;
5470
Guido van Rossumb8872e62000-05-09 14:14:27 +00005471 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5472 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 return NULL;
5474 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5475 (PyObject *)substring);
5476 if (substring == NULL)
5477 return NULL;
5478
5479 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5480
5481 Py_DECREF(substring);
5482 return result;
5483}
5484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005485PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486"S.rindex(sub [,start [,end]]) -> int\n\
5487\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005488Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
5490static PyObject *
5491unicode_rindex(PyUnicodeObject *self, PyObject *args)
5492{
5493 int result;
5494 PyUnicodeObject *substring;
5495 int start = 0;
5496 int end = INT_MAX;
5497
Guido van Rossumb8872e62000-05-09 14:14:27 +00005498 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5499 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 return NULL;
5501 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5502 (PyObject *)substring);
5503 if (substring == NULL)
5504 return NULL;
5505
5506 result = findstring(self, substring, start, end, -1);
5507
5508 Py_DECREF(substring);
5509 if (result < 0) {
5510 PyErr_SetString(PyExc_ValueError, "substring not found");
5511 return NULL;
5512 }
5513 return PyInt_FromLong(result);
5514}
5515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005516PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517"S.rjust(width) -> unicode\n\
5518\n\
5519Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005520done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521
5522static PyObject *
5523unicode_rjust(PyUnicodeObject *self, PyObject *args)
5524{
5525 int width;
5526 if (!PyArg_ParseTuple(args, "i:rjust", &width))
5527 return NULL;
5528
Tim Peters7a29bd52001-09-12 03:03:31 +00005529 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530 Py_INCREF(self);
5531 return (PyObject*) self;
5532 }
5533
5534 return (PyObject*) pad(self, width - self->length, 0, ' ');
5535}
5536
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537static PyObject*
5538unicode_slice(PyUnicodeObject *self, int start, int end)
5539{
5540 /* standard clamping */
5541 if (start < 0)
5542 start = 0;
5543 if (end < 0)
5544 end = 0;
5545 if (end > self->length)
5546 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005547 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 /* full slice, return original string */
5549 Py_INCREF(self);
5550 return (PyObject*) self;
5551 }
5552 if (start > end)
5553 start = end;
5554 /* copy slice */
5555 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5556 end - start);
5557}
5558
5559PyObject *PyUnicode_Split(PyObject *s,
5560 PyObject *sep,
5561 int maxsplit)
5562{
5563 PyObject *result;
5564
5565 s = PyUnicode_FromObject(s);
5566 if (s == NULL)
5567 return NULL;
5568 if (sep != NULL) {
5569 sep = PyUnicode_FromObject(sep);
5570 if (sep == NULL) {
5571 Py_DECREF(s);
5572 return NULL;
5573 }
5574 }
5575
5576 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5577
5578 Py_DECREF(s);
5579 Py_XDECREF(sep);
5580 return result;
5581}
5582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005583PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584"S.split([sep [,maxsplit]]) -> list of strings\n\
5585\n\
5586Return a list of the words in S, using sep as the\n\
5587delimiter string. If maxsplit is given, at most maxsplit\n\
5588splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005589is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590
5591static PyObject*
5592unicode_split(PyUnicodeObject *self, PyObject *args)
5593{
5594 PyObject *substring = Py_None;
5595 int maxcount = -1;
5596
5597 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5598 return NULL;
5599
5600 if (substring == Py_None)
5601 return split(self, NULL, maxcount);
5602 else if (PyUnicode_Check(substring))
5603 return split(self, (PyUnicodeObject *)substring, maxcount);
5604 else
5605 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5606}
5607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005608PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005609"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610\n\
5611Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005612Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005613is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614
5615static PyObject*
5616unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5617{
Guido van Rossum86662912000-04-11 15:38:46 +00005618 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619
Guido van Rossum86662912000-04-11 15:38:46 +00005620 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 return NULL;
5622
Guido van Rossum86662912000-04-11 15:38:46 +00005623 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624}
5625
5626static
5627PyObject *unicode_str(PyUnicodeObject *self)
5628{
Fred Drakee4315f52000-05-09 19:53:39 +00005629 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630}
5631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005632PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633"S.swapcase() -> unicode\n\
5634\n\
5635Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005636and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637
5638static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005639unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 return fixup(self, fixswapcase);
5642}
5643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005644PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645"S.translate(table) -> unicode\n\
5646\n\
5647Return a copy of the string S, where all characters have been mapped\n\
5648through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005649Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5650Unmapped characters are left untouched. Characters mapped to None\n\
5651are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652
5653static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005654unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 return PyUnicode_TranslateCharmap(self->str,
5657 self->length,
5658 table,
5659 "ignore");
5660}
5661
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005662PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663"S.upper() -> unicode\n\
5664\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005665Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666
5667static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005668unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 return fixup(self, fixupper);
5671}
5672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005673PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674"S.zfill(width) -> unicode\n\
5675\n\
5676Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005677of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678
5679static PyObject *
5680unicode_zfill(PyUnicodeObject *self, PyObject *args)
5681{
5682 int fill;
5683 PyUnicodeObject *u;
5684
5685 int width;
5686 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5687 return NULL;
5688
5689 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005690 if (PyUnicode_CheckExact(self)) {
5691 Py_INCREF(self);
5692 return (PyObject*) self;
5693 }
5694 else
5695 return PyUnicode_FromUnicode(
5696 PyUnicode_AS_UNICODE(self),
5697 PyUnicode_GET_SIZE(self)
5698 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 }
5700
5701 fill = width - self->length;
5702
5703 u = pad(self, fill, 0, '0');
5704
Walter Dörwald068325e2002-04-15 13:36:47 +00005705 if (u == NULL)
5706 return NULL;
5707
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 if (u->str[fill] == '+' || u->str[fill] == '-') {
5709 /* move sign to beginning of string */
5710 u->str[0] = u->str[fill];
5711 u->str[fill] = '0';
5712 }
5713
5714 return (PyObject*) u;
5715}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716
#if 0
/* Compiled out: returns unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
5724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005725PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005726"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005728Return True if S starts with the specified prefix, False otherwise.\n\
5729With optional start, test S beginning at that position.\n\
5730With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731
5732static PyObject *
5733unicode_startswith(PyUnicodeObject *self,
5734 PyObject *args)
5735{
5736 PyUnicodeObject *substring;
5737 int start = 0;
5738 int end = INT_MAX;
5739 PyObject *result;
5740
Guido van Rossumb8872e62000-05-09 14:14:27 +00005741 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5742 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 return NULL;
5744 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5745 (PyObject *)substring);
5746 if (substring == NULL)
5747 return NULL;
5748
Guido van Rossum77f6a652002-04-03 22:41:51 +00005749 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750
5751 Py_DECREF(substring);
5752 return result;
5753}
5754
5755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005756PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005757"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005759Return True if S ends with the specified suffix, False otherwise.\n\
5760With optional start, test S beginning at that position.\n\
5761With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762
5763static PyObject *
5764unicode_endswith(PyUnicodeObject *self,
5765 PyObject *args)
5766{
5767 PyUnicodeObject *substring;
5768 int start = 0;
5769 int end = INT_MAX;
5770 PyObject *result;
5771
Guido van Rossumb8872e62000-05-09 14:14:27 +00005772 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5773 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774 return NULL;
5775 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5776 (PyObject *)substring);
5777 if (substring == NULL)
5778 return NULL;
5779
Guido van Rossum77f6a652002-04-03 22:41:51 +00005780 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781
5782 Py_DECREF(substring);
5783 return result;
5784}
5785
5786
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005787
5788static PyObject *
5789unicode_getnewargs(PyUnicodeObject *v)
5790{
5791 return Py_BuildValue("(u#)", v->str, v->length);
5792}
5793
5794
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795static PyMethodDef unicode_methods[] = {
5796
5797 /* Order is according to common usage: often used methods should
5798 appear first, since lookup is done sequentially. */
5799
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005800 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5801 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5802 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5803 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5804 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5805 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5806 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5807 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5808 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5809 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5810 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5811 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5812 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005813 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005814/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5815 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5816 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5817 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005818 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005819 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005820 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005821 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5822 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5823 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5824 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5825 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5826 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5827 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5828 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5829 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5830 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5831 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5832 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5833 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5834 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005835 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005836#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005837 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838#endif
5839
5840#if 0
5841 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005842 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843#endif
5844
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005845 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 {NULL, NULL}
5847};
5848
Neil Schemenauerce30bc92002-11-18 16:10:18 +00005849static PyObject *
5850unicode_mod(PyObject *v, PyObject *w)
5851{
5852 if (!PyUnicode_Check(v)) {
5853 Py_INCREF(Py_NotImplemented);
5854 return Py_NotImplemented;
5855 }
5856 return PyUnicode_Format(v, w);
5857}
5858
5859static PyNumberMethods unicode_as_number = {
5860 0, /*nb_add*/
5861 0, /*nb_subtract*/
5862 0, /*nb_multiply*/
5863 0, /*nb_divide*/
5864 unicode_mod, /*nb_remainder*/
5865};
5866
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867static PySequenceMethods unicode_as_sequence = {
5868 (inquiry) unicode_length, /* sq_length */
5869 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5870 (intargfunc) unicode_repeat, /* sq_repeat */
5871 (intargfunc) unicode_getitem, /* sq_item */
5872 (intintargfunc) unicode_slice, /* sq_slice */
5873 0, /* sq_ass_item */
5874 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005875 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876};
5877
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005878static PyObject*
5879unicode_subscript(PyUnicodeObject* self, PyObject* item)
5880{
5881 if (PyInt_Check(item)) {
5882 long i = PyInt_AS_LONG(item);
5883 if (i < 0)
5884 i += PyString_GET_SIZE(self);
5885 return unicode_getitem(self, i);
5886 } else if (PyLong_Check(item)) {
5887 long i = PyLong_AsLong(item);
5888 if (i == -1 && PyErr_Occurred())
5889 return NULL;
5890 if (i < 0)
5891 i += PyString_GET_SIZE(self);
5892 return unicode_getitem(self, i);
5893 } else if (PySlice_Check(item)) {
5894 int start, stop, step, slicelength, cur, i;
5895 Py_UNICODE* source_buf;
5896 Py_UNICODE* result_buf;
5897 PyObject* result;
5898
5899 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5900 &start, &stop, &step, &slicelength) < 0) {
5901 return NULL;
5902 }
5903
5904 if (slicelength <= 0) {
5905 return PyUnicode_FromUnicode(NULL, 0);
5906 } else {
5907 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5908 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5909
5910 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5911 result_buf[i] = source_buf[cur];
5912 }
5913
5914 result = PyUnicode_FromUnicode(result_buf, slicelength);
5915 PyMem_FREE(result_buf);
5916 return result;
5917 }
5918 } else {
5919 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5920 return NULL;
5921 }
5922}
5923
5924static PyMappingMethods unicode_as_mapping = {
5925 (inquiry)unicode_length, /* mp_length */
5926 (binaryfunc)unicode_subscript, /* mp_subscript */
5927 (objobjargproc)0, /* mp_ass_subscript */
5928};
5929
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930static int
5931unicode_buffer_getreadbuf(PyUnicodeObject *self,
5932 int index,
5933 const void **ptr)
5934{
5935 if (index != 0) {
5936 PyErr_SetString(PyExc_SystemError,
5937 "accessing non-existent unicode segment");
5938 return -1;
5939 }
5940 *ptr = (void *) self->str;
5941 return PyUnicode_GET_DATA_SIZE(self);
5942}
5943
5944static int
5945unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5946 const void **ptr)
5947{
5948 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005949 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 return -1;
5951}
5952
5953static int
5954unicode_buffer_getsegcount(PyUnicodeObject *self,
5955 int *lenp)
5956{
5957 if (lenp)
5958 *lenp = PyUnicode_GET_DATA_SIZE(self);
5959 return 1;
5960}
5961
5962static int
5963unicode_buffer_getcharbuf(PyUnicodeObject *self,
5964 int index,
5965 const void **ptr)
5966{
5967 PyObject *str;
5968
5969 if (index != 0) {
5970 PyErr_SetString(PyExc_SystemError,
5971 "accessing non-existent unicode segment");
5972 return -1;
5973 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005974 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 if (str == NULL)
5976 return -1;
5977 *ptr = (void *) PyString_AS_STRING(str);
5978 return PyString_GET_SIZE(str);
5979}
5980
5981/* Helpers for PyUnicode_Format() */
5982
5983static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005984getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985{
5986 int argidx = *p_argidx;
5987 if (argidx < arglen) {
5988 (*p_argidx)++;
5989 if (arglen < 0)
5990 return args;
5991 else
5992 return PyTuple_GetItem(args, argidx);
5993 }
5994 PyErr_SetString(PyExc_TypeError,
5995 "not enough arguments for format string");
5996 return NULL;
5997}
5998
/* Single-bit flag values; they can be combined with | and tested with &. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6004
6005static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007{
6008 register int i;
6009 int len;
6010 va_list va;
6011 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013
6014 /* First, format the string as char array, then expand to Py_UNICODE
6015 array. */
6016 charbuffer = (char *)buffer;
6017 len = vsprintf(charbuffer, format, va);
6018 for (i = len - 1; i >= 0; i--)
6019 buffer[i] = (Py_UNICODE) charbuffer[i];
6020
6021 va_end(va);
6022 return len;
6023}
6024
Guido van Rossum078151d2002-08-11 04:24:12 +00006025/* XXX To save some code duplication, formatfloat/long/int could have been
6026 shared with stringobject.c, converting from 8-bit to Unicode after the
6027 formatting is done. */
6028
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029static int
6030formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006031 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 int flags,
6033 int prec,
6034 int type,
6035 PyObject *v)
6036{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006037 /* fmt = '%#.' + `prec` + `type`
6038 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 char fmt[20];
6040 double x;
6041
6042 x = PyFloat_AsDouble(v);
6043 if (x == -1.0 && PyErr_Occurred())
6044 return -1;
6045 if (prec < 0)
6046 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6048 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006049 /* Worst case length calc to ensure no buffer overrun:
6050
6051 'g' formats:
6052 fmt = %#.<prec>g
6053 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6054 for any double rep.)
6055 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6056
6057 'f' formats:
6058 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6059 len = 1 + 50 + 1 + prec = 52 + prec
6060
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006061 If prec=0 the effective precision is 1 (the leading digit is
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006062 always given), therefore increase the length by one.
6063
6064 */
6065 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6066 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006067 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006068 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006069 return -1;
6070 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006071 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6072 (flags&F_ALT) ? "#" : "",
6073 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 return usprintf(buf, fmt, x);
6075}
6076
Tim Peters38fd5b62000-09-21 05:43:11 +00006077static PyObject*
6078formatlong(PyObject *val, int flags, int prec, int type)
6079{
6080 char *buf;
6081 int i, len;
6082 PyObject *str; /* temporary string object. */
6083 PyUnicodeObject *result;
6084
6085 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6086 if (!str)
6087 return NULL;
6088 result = _PyUnicode_New(len);
6089 for (i = 0; i < len; i++)
6090 result->str[i] = buf[i];
6091 result->str[len] = 0;
6092 Py_DECREF(str);
6093 return (PyObject*)result;
6094}
6095
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096static int
6097formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006098 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 int flags,
6100 int prec,
6101 int type,
6102 PyObject *v)
6103{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006104 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006105 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6106 * + 1 + 1
6107 * = 24
6108 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006109 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 long x;
6111
6112 x = PyInt_AsLong(v);
6113 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006114 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00006115 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00006116 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00006117 "%u/%o/%x/%X of negative int will return "
6118 "a signed string in Python 2.4 and up") < 0)
6119 return -1;
6120 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006122 prec = 1;
6123
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006124 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006125 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6126 */
6127 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006128 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006129 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006130 return -1;
6131 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006132
6133 if ((flags & F_ALT) &&
6134 (type == 'x' || type == 'X')) {
6135 /* When converting under %#x or %#X, there are a number
6136 * of issues that cause pain:
6137 * - when 0 is being converted, the C standard leaves off
6138 * the '0x' or '0X', which is inconsistent with other
6139 * %#x/%#X conversions and inconsistent with Python's
6140 * hex() function
6141 * - there are platforms that violate the standard and
6142 * convert 0 with the '0x' or '0X'
6143 * (Metrowerks, Compaq Tru64)
6144 * - there are platforms that give '0x' when converting
6145 * under %#X, but convert 0 in accordance with the
6146 * standard (OS/2 EMX)
6147 *
6148 * We can achieve the desired consistency by inserting our
6149 * own '0x' or '0X' prefix, and substituting %x/%X in place
6150 * of %#x/%#X.
6151 *
6152 * Note that this is the same approach as used in
6153 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006154 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006155 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6156 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006157 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006158 else {
6159 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6160 (flags&F_ALT) ? "#" : "",
6161 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006162 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 return usprintf(buf, fmt, x);
6164}
6165
6166static int
6167formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006168 size_t buflen,
6169 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006171 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006172 if (PyUnicode_Check(v)) {
6173 if (PyUnicode_GET_SIZE(v) != 1)
6174 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006178 else if (PyString_Check(v)) {
6179 if (PyString_GET_SIZE(v) != 1)
6180 goto onError;
6181 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183
6184 else {
6185 /* Integer input truncated to a character */
6186 long x;
6187 x = PyInt_AsLong(v);
6188 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006189 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006190#ifdef Py_UNICODE_WIDE
6191 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006192 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006193 "%c arg not in range(0x110000) "
6194 "(wide Python build)");
6195 return -1;
6196 }
6197#else
6198 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006199 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006200 "%c arg not in range(0x10000) "
6201 "(narrow Python build)");
6202 return -1;
6203 }
6204#endif
6205 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 }
6207 buf[1] = '\0';
6208 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006209
6210 onError:
6211 PyErr_SetString(PyExc_TypeError,
6212 "%c requires int or char");
6213 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214}
6215
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
6225
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226PyObject *PyUnicode_Format(PyObject *format,
6227 PyObject *args)
6228{
6229 Py_UNICODE *fmt, *res;
6230 int fmtcnt, rescnt, reslen, arglen, argidx;
6231 int args_owned = 0;
6232 PyUnicodeObject *result = NULL;
6233 PyObject *dict = NULL;
6234 PyObject *uformat;
6235
6236 if (format == NULL || args == NULL) {
6237 PyErr_BadInternalCall();
6238 return NULL;
6239 }
6240 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006241 if (uformat == NULL)
6242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 fmt = PyUnicode_AS_UNICODE(uformat);
6244 fmtcnt = PyUnicode_GET_SIZE(uformat);
6245
6246 reslen = rescnt = fmtcnt + 100;
6247 result = _PyUnicode_New(reslen);
6248 if (result == NULL)
6249 goto onError;
6250 res = PyUnicode_AS_UNICODE(result);
6251
6252 if (PyTuple_Check(args)) {
6253 arglen = PyTuple_Size(args);
6254 argidx = 0;
6255 }
6256 else {
6257 arglen = -1;
6258 argidx = -2;
6259 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006260 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6261 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 dict = args;
6263
6264 while (--fmtcnt >= 0) {
6265 if (*fmt != '%') {
6266 if (--rescnt < 0) {
6267 rescnt = fmtcnt + 100;
6268 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006269 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 return NULL;
6271 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6272 --rescnt;
6273 }
6274 *res++ = *fmt++;
6275 }
6276 else {
6277 /* Got a format specifier */
6278 int flags = 0;
6279 int width = -1;
6280 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 Py_UNICODE c = '\0';
6282 Py_UNICODE fill;
6283 PyObject *v = NULL;
6284 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006285 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 Py_UNICODE sign;
6287 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006288 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289
6290 fmt++;
6291 if (*fmt == '(') {
6292 Py_UNICODE *keystart;
6293 int keylen;
6294 PyObject *key;
6295 int pcount = 1;
6296
6297 if (dict == NULL) {
6298 PyErr_SetString(PyExc_TypeError,
6299 "format requires a mapping");
6300 goto onError;
6301 }
6302 ++fmt;
6303 --fmtcnt;
6304 keystart = fmt;
6305 /* Skip over balanced parentheses */
6306 while (pcount > 0 && --fmtcnt >= 0) {
6307 if (*fmt == ')')
6308 --pcount;
6309 else if (*fmt == '(')
6310 ++pcount;
6311 fmt++;
6312 }
6313 keylen = fmt - keystart - 1;
6314 if (fmtcnt < 0 || pcount > 0) {
6315 PyErr_SetString(PyExc_ValueError,
6316 "incomplete format key");
6317 goto onError;
6318 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006319#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006320 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 then looked up since Python uses strings to hold
6322 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006323 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 key = PyUnicode_EncodeUTF8(keystart,
6325 keylen,
6326 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006327#else
6328 key = PyUnicode_FromUnicode(keystart, keylen);
6329#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 if (key == NULL)
6331 goto onError;
6332 if (args_owned) {
6333 Py_DECREF(args);
6334 args_owned = 0;
6335 }
6336 args = PyObject_GetItem(dict, key);
6337 Py_DECREF(key);
6338 if (args == NULL) {
6339 goto onError;
6340 }
6341 args_owned = 1;
6342 arglen = -1;
6343 argidx = -2;
6344 }
6345 while (--fmtcnt >= 0) {
6346 switch (c = *fmt++) {
6347 case '-': flags |= F_LJUST; continue;
6348 case '+': flags |= F_SIGN; continue;
6349 case ' ': flags |= F_BLANK; continue;
6350 case '#': flags |= F_ALT; continue;
6351 case '0': flags |= F_ZERO; continue;
6352 }
6353 break;
6354 }
6355 if (c == '*') {
6356 v = getnextarg(args, arglen, &argidx);
6357 if (v == NULL)
6358 goto onError;
6359 if (!PyInt_Check(v)) {
6360 PyErr_SetString(PyExc_TypeError,
6361 "* wants int");
6362 goto onError;
6363 }
6364 width = PyInt_AsLong(v);
6365 if (width < 0) {
6366 flags |= F_LJUST;
6367 width = -width;
6368 }
6369 if (--fmtcnt >= 0)
6370 c = *fmt++;
6371 }
6372 else if (c >= '0' && c <= '9') {
6373 width = c - '0';
6374 while (--fmtcnt >= 0) {
6375 c = *fmt++;
6376 if (c < '0' || c > '9')
6377 break;
6378 if ((width*10) / 10 != width) {
6379 PyErr_SetString(PyExc_ValueError,
6380 "width too big");
6381 goto onError;
6382 }
6383 width = width*10 + (c - '0');
6384 }
6385 }
6386 if (c == '.') {
6387 prec = 0;
6388 if (--fmtcnt >= 0)
6389 c = *fmt++;
6390 if (c == '*') {
6391 v = getnextarg(args, arglen, &argidx);
6392 if (v == NULL)
6393 goto onError;
6394 if (!PyInt_Check(v)) {
6395 PyErr_SetString(PyExc_TypeError,
6396 "* wants int");
6397 goto onError;
6398 }
6399 prec = PyInt_AsLong(v);
6400 if (prec < 0)
6401 prec = 0;
6402 if (--fmtcnt >= 0)
6403 c = *fmt++;
6404 }
6405 else if (c >= '0' && c <= '9') {
6406 prec = c - '0';
6407 while (--fmtcnt >= 0) {
6408 c = Py_CHARMASK(*fmt++);
6409 if (c < '0' || c > '9')
6410 break;
6411 if ((prec*10) / 10 != prec) {
6412 PyErr_SetString(PyExc_ValueError,
6413 "prec too big");
6414 goto onError;
6415 }
6416 prec = prec*10 + (c - '0');
6417 }
6418 }
6419 } /* prec */
6420 if (fmtcnt >= 0) {
6421 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 if (--fmtcnt >= 0)
6423 c = *fmt++;
6424 }
6425 }
6426 if (fmtcnt < 0) {
6427 PyErr_SetString(PyExc_ValueError,
6428 "incomplete format");
6429 goto onError;
6430 }
6431 if (c != '%') {
6432 v = getnextarg(args, arglen, &argidx);
6433 if (v == NULL)
6434 goto onError;
6435 }
6436 sign = 0;
6437 fill = ' ';
6438 switch (c) {
6439
6440 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006441 pbuf = formatbuf;
6442 /* presume that buffer length is at least 1 */
6443 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 len = 1;
6445 break;
6446
6447 case 's':
6448 case 'r':
6449 if (PyUnicode_Check(v) && c == 's') {
6450 temp = v;
6451 Py_INCREF(temp);
6452 }
6453 else {
6454 PyObject *unicode;
6455 if (c == 's')
6456 temp = PyObject_Str(v);
6457 else
6458 temp = PyObject_Repr(v);
6459 if (temp == NULL)
6460 goto onError;
6461 if (!PyString_Check(temp)) {
6462 /* XXX Note: this should never happen, since
6463 PyObject_Repr() and PyObject_Str() assure
6464 this */
6465 Py_DECREF(temp);
6466 PyErr_SetString(PyExc_TypeError,
6467 "%s argument has non-string str()");
6468 goto onError;
6469 }
Fred Drakee4315f52000-05-09 19:53:39 +00006470 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006472 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 "strict");
6474 Py_DECREF(temp);
6475 temp = unicode;
6476 if (temp == NULL)
6477 goto onError;
6478 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006479 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 len = PyUnicode_GET_SIZE(temp);
6481 if (prec >= 0 && len > prec)
6482 len = prec;
6483 break;
6484
6485 case 'i':
6486 case 'd':
6487 case 'u':
6488 case 'o':
6489 case 'x':
6490 case 'X':
6491 if (c == 'i')
6492 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006493 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006494 temp = formatlong(v, flags, prec, c);
6495 if (!temp)
6496 goto onError;
6497 pbuf = PyUnicode_AS_UNICODE(temp);
6498 len = PyUnicode_GET_SIZE(temp);
6499 /* unbounded ints can always produce
6500 a sign character! */
6501 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006503 else {
6504 pbuf = formatbuf;
6505 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6506 flags, prec, c, v);
6507 if (len < 0)
6508 goto onError;
6509 /* only d conversion is signed */
6510 sign = c == 'd';
6511 }
6512 if (flags & F_ZERO)
6513 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 break;
6515
6516 case 'e':
6517 case 'E':
6518 case 'f':
6519 case 'g':
6520 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006521 pbuf = formatbuf;
6522 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6523 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524 if (len < 0)
6525 goto onError;
6526 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006527 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 fill = '0';
6529 break;
6530
6531 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006532 pbuf = formatbuf;
6533 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 if (len < 0)
6535 goto onError;
6536 break;
6537
6538 default:
6539 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006540 "unsupported format character '%c' (0x%x) "
6541 "at index %i",
Neal Norwitza0378e12002-09-13 13:47:06 +00006542 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006543 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006544 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 goto onError;
6546 }
6547 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006548 if (*pbuf == '-' || *pbuf == '+') {
6549 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 len--;
6551 }
6552 else if (flags & F_SIGN)
6553 sign = '+';
6554 else if (flags & F_BLANK)
6555 sign = ' ';
6556 else
6557 sign = 0;
6558 }
6559 if (width < len)
6560 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006561 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 reslen -= rescnt;
6563 rescnt = width + fmtcnt + 100;
6564 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006565 if (reslen < 0) {
6566 Py_DECREF(result);
6567 return PyErr_NoMemory();
6568 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006569 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 return NULL;
6571 res = PyUnicode_AS_UNICODE(result)
6572 + reslen - rescnt;
6573 }
6574 if (sign) {
6575 if (fill != ' ')
6576 *res++ = sign;
6577 rescnt--;
6578 if (width > len)
6579 width--;
6580 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006581 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6582 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006583 assert(pbuf[1] == c);
6584 if (fill != ' ') {
6585 *res++ = *pbuf++;
6586 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006587 }
Tim Petersfff53252001-04-12 18:38:48 +00006588 rescnt -= 2;
6589 width -= 2;
6590 if (width < 0)
6591 width = 0;
6592 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594 if (width > len && !(flags & F_LJUST)) {
6595 do {
6596 --rescnt;
6597 *res++ = fill;
6598 } while (--width > len);
6599 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006600 if (fill == ' ') {
6601 if (sign)
6602 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006603 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006604 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006605 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006606 *res++ = *pbuf++;
6607 *res++ = *pbuf++;
6608 }
6609 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006610 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 res += len;
6612 rescnt -= len;
6613 while (--width >= len) {
6614 --rescnt;
6615 *res++ = ' ';
6616 }
6617 if (dict && (argidx < arglen) && c != '%') {
6618 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006619 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 goto onError;
6621 }
6622 Py_XDECREF(temp);
6623 } /* '%' */
6624 } /* until end */
6625 if (argidx < arglen && !dict) {
6626 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006627 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 goto onError;
6629 }
6630
6631 if (args_owned) {
6632 Py_DECREF(args);
6633 }
6634 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006635 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006636 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 return (PyObject *)result;
6638
6639 onError:
6640 Py_XDECREF(result);
6641 Py_DECREF(uformat);
6642 if (args_owned) {
6643 Py_DECREF(args);
6644 }
6645 return NULL;
6646}
6647
6648static PyBufferProcs unicode_as_buffer = {
6649 (getreadbufferproc) unicode_buffer_getreadbuf,
6650 (getwritebufferproc) unicode_buffer_getwritebuf,
6651 (getsegcountproc) unicode_buffer_getsegcount,
6652 (getcharbufferproc) unicode_buffer_getcharbuf,
6653};
6654
Jeremy Hylton938ace62002-07-17 16:30:39 +00006655static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006656unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6657
Tim Peters6d6c1a32001-08-02 04:15:00 +00006658static PyObject *
6659unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6660{
6661 PyObject *x = NULL;
6662 static char *kwlist[] = {"string", "encoding", "errors", 0};
6663 char *encoding = NULL;
6664 char *errors = NULL;
6665
Guido van Rossume023fe02001-08-30 03:12:59 +00006666 if (type != &PyUnicode_Type)
6667 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006668 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6669 kwlist, &x, &encoding, &errors))
6670 return NULL;
6671 if (x == NULL)
6672 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006673 if (encoding == NULL && errors == NULL)
6674 return PyObject_Unicode(x);
6675 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006676 return PyUnicode_FromEncodedObject(x, encoding, errors);
6677}
6678
Guido van Rossume023fe02001-08-30 03:12:59 +00006679static PyObject *
6680unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6681{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006682 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006683 int n;
6684
6685 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6686 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6687 if (tmp == NULL)
6688 return NULL;
6689 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006690 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006691 if (pnew == NULL) {
6692 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00006693 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00006694 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006695 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6696 if (pnew->str == NULL) {
6697 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006698 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006699 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00006700 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00006701 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006702 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6703 pnew->length = n;
6704 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006705 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006706 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006707}
6708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006709PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00006710"unicode(string [, encoding[, errors]]) -> object\n\
6711\n\
6712Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00006713encoding defaults to the current default string encoding.\n\
6714errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006715
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716PyTypeObject PyUnicode_Type = {
6717 PyObject_HEAD_INIT(&PyType_Type)
6718 0, /* ob_size */
6719 "unicode", /* tp_name */
6720 sizeof(PyUnicodeObject), /* tp_size */
6721 0, /* tp_itemsize */
6722 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00006723 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006725 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 0, /* tp_setattr */
6727 (cmpfunc) unicode_compare, /* tp_compare */
6728 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006729 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006731 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 (hashfunc) unicode_hash, /* tp_hash*/
6733 0, /* tp_call*/
6734 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006735 PyObject_GenericGetAttr, /* tp_getattro */
6736 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006738 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
6739 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006740 unicode_doc, /* tp_doc */
6741 0, /* tp_traverse */
6742 0, /* tp_clear */
6743 0, /* tp_richcompare */
6744 0, /* tp_weaklistoffset */
6745 0, /* tp_iter */
6746 0, /* tp_iternext */
6747 unicode_methods, /* tp_methods */
6748 0, /* tp_members */
6749 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00006750 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006751 0, /* tp_dict */
6752 0, /* tp_descr_get */
6753 0, /* tp_descr_set */
6754 0, /* tp_dictoffset */
6755 0, /* tp_init */
6756 0, /* tp_alloc */
6757 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006758 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759};
6760
6761/* Initialize the Unicode implementation */
6762
Thomas Wouters78890102000-07-22 19:25:51 +00006763void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006765 int i;
6766
Fred Drakee4315f52000-05-09 19:53:39 +00006767 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006768 unicode_freelist = NULL;
6769 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00006771 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006772 for (i = 0; i < 256; i++)
6773 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006774 if (PyType_Ready(&PyUnicode_Type) < 0)
6775 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776}
6777
6778/* Finalize the Unicode implementation */
6779
6780void
Thomas Wouters78890102000-07-22 19:25:51 +00006781_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006783 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006784 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006786 Py_XDECREF(unicode_empty);
6787 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006788
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006789 for (i = 0; i < 256; i++) {
6790 if (unicode_latin1[i]) {
6791 Py_DECREF(unicode_latin1[i]);
6792 unicode_latin1[i] = NULL;
6793 }
6794 }
6795
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006796 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797 PyUnicodeObject *v = u;
6798 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006799 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006800 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006801 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006802 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006804 unicode_freelist = NULL;
6805 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006807
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/