blob: 7ba9547b1f747b8f23a663c9cbda8bb72b2c5567 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
/* Free list for Unicode objects.

   unicode_freelist is the head of a singly linked list of recycled
   objects; the link to the next entry is stored in the first
   pointer-sized word of each object.  unicode_freelist_size counts the
   objects currently on the list and is capped at
   MAX_UNICODE_FREELIST_SIZE by unicode_dealloc(). */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance.
   It may still be NULL, so users must check before sharing it. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by the character's ordinal;
   entries are filled in lazily by PyUnicode_FromUnicode(). */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

   The name is stored as a C string in this fixed-size buffer.

*/
static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000279 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000280 *unicode = (PyObject *)w;
281 return 0;
282 }
283
284 /* Note that we don't have to modify *unicode for unshared Unicode
285 objects, since we can modify them in-place. */
286 return unicode_resize(v, length);
287}
288
/* Internal API for use in unicodeobject.c only !

   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a variable (typically a PyUnicodeObject *) to the PyObject **
   the public function expects. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
294 int size)
295{
296 PyUnicodeObject *unicode;
297
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000298 /* If the Unicode data is known at construction time, we can apply
299 some optimizations which share commonly used objects. */
300 if (u != NULL) {
301
302 /* Optimization for empty strings */
303 if (size == 0 && unicode_empty != NULL) {
304 Py_INCREF(unicode_empty);
305 return (PyObject *)unicode_empty;
306 }
307
308 /* Single character Unicode objects in the Latin-1 range are
309 shared when using this constructor */
310 if (size == 1 && *u < 256) {
311 unicode = unicode_latin1[*u];
312 if (!unicode) {
313 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000314 if (!unicode)
315 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000316 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000317 unicode_latin1[*u] = unicode;
318 }
319 Py_INCREF(unicode);
320 return (PyObject *)unicode;
321 }
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 unicode = _PyUnicode_New(size);
325 if (!unicode)
326 return NULL;
327
328 /* Copy the Unicode data into the new object */
329 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331
332 return (PyObject *)unicode;
333}
334
335#ifdef HAVE_WCHAR_H
336
337PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
338 int size)
339{
340 PyUnicodeObject *unicode;
341
342 if (w == NULL) {
343 PyErr_BadInternalCall();
344 return NULL;
345 }
346
347 unicode = _PyUnicode_New(size);
348 if (!unicode)
349 return NULL;
350
351 /* Copy the wchar_t data into the new object */
352#ifdef HAVE_USABLE_WCHAR_T
353 memcpy(unicode->str, w, size * sizeof(wchar_t));
354#else
355 {
356 register Py_UNICODE *u;
357 register int i;
358 u = PyUnicode_AS_UNICODE(unicode);
359 for (i = size; i >= 0; i--)
360 *u++ = *w++;
361 }
362#endif
363
364 return (PyObject *)unicode;
365}
366
367int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
368 register wchar_t *w,
369 int size)
370{
371 if (unicode == NULL) {
372 PyErr_BadInternalCall();
373 return -1;
374 }
375 if (size > PyUnicode_GET_SIZE(unicode))
376 size = PyUnicode_GET_SIZE(unicode);
377#ifdef HAVE_USABLE_WCHAR_T
378 memcpy(w, unicode->str, size * sizeof(wchar_t));
379#else
380 {
381 register Py_UNICODE *u;
382 register int i;
383 u = PyUnicode_AS_UNICODE(unicode);
384 for (i = size; i >= 0; i--)
385 *w++ = *u++;
386 }
387#endif
388
389 return size;
390}
391
392#endif
393
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000394PyObject *PyUnicode_FromOrdinal(int ordinal)
395{
396 Py_UNICODE s[2];
397
398#ifdef Py_UNICODE_WIDE
399 if (ordinal < 0 || ordinal > 0x10ffff) {
400 PyErr_SetString(PyExc_ValueError,
401 "unichr() arg not in range(0x110000) "
402 "(wide Python build)");
403 return NULL;
404 }
405#else
406 if (ordinal < 0 || ordinal > 0xffff) {
407 PyErr_SetString(PyExc_ValueError,
408 "unichr() arg not in range(0x10000) "
409 "(narrow Python build)");
410 return NULL;
411 }
412#endif
413
414 if (ordinal <= 0xffff) {
415 /* UCS-2 character */
416 s[0] = (Py_UNICODE) ordinal;
417 return PyUnicode_FromUnicode(s, 1);
418 }
419 else {
420#ifndef Py_UNICODE_WIDE
421 /* UCS-4 character. store as two surrogate characters */
422 ordinal -= 0x10000L;
423 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
424 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
425 return PyUnicode_FromUnicode(s, 2);
426#else
427 s[0] = (Py_UNICODE)ordinal;
428 return PyUnicode_FromUnicode(s, 1);
429#endif
430 }
431}
432
Guido van Rossumd57fd912000-03-10 22:53:23 +0000433PyObject *PyUnicode_FromObject(register PyObject *obj)
434{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000435 /* XXX Perhaps we should make this API an alias of
436 PyObject_Unicode() instead ?! */
437 if (PyUnicode_CheckExact(obj)) {
438 Py_INCREF(obj);
439 return obj;
440 }
441 if (PyUnicode_Check(obj)) {
442 /* For a Unicode subtype that's not a Unicode object,
443 return a true Unicode object with the same data. */
444 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
445 PyUnicode_GET_SIZE(obj));
446 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
448}
449
450PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
451 const char *encoding,
452 const char *errors)
453{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000456 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457
458 if (obj == NULL) {
459 PyErr_BadInternalCall();
460 return NULL;
461 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000462
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000463#if 0
464 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000465 that no encodings is given and then redirect to
466 PyObject_Unicode() which then applies the additional logic for
467 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000468
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000469 NOTE: This API should really only be used for object which
470 represent *encoded* Unicode !
471
472 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000473 if (PyUnicode_Check(obj)) {
474 if (encoding) {
475 PyErr_SetString(PyExc_TypeError,
476 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000477 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000478 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000479 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481#else
482 if (PyUnicode_Check(obj)) {
483 PyErr_SetString(PyExc_TypeError,
484 "decoding Unicode is not supported");
485 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000486 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487#endif
488
489 /* Coerce object */
490 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000491 s = PyString_AS_STRING(obj);
492 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000493 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000494 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
495 /* Overwrite the error message with something more useful in
496 case of a TypeError. */
497 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499 "coercing to Unicode: need string or buffer, "
500 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 obj->ob_type->tp_name);
502 goto onError;
503 }
504
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 if (len == 0) {
507 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 else
511 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000512
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 return v;
514
515 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517}
518
519PyObject *PyUnicode_Decode(const char *s,
520 int size,
521 const char *encoding,
522 const char *errors)
523{
524 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000525
526 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000527 encoding = PyUnicode_GetDefaultEncoding();
528
529 /* Shortcuts for common default encodings */
530 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000532 else if (strcmp(encoding, "latin-1") == 0)
533 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000534#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
535 else if (strcmp(encoding, "mbcs") == 0)
536 return PyUnicode_DecodeMBCS(s, size, errors);
537#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000538 else if (strcmp(encoding, "ascii") == 0)
539 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000540
541 /* Decode via the codec registry */
542 buffer = PyBuffer_FromMemory((void *)s, size);
543 if (buffer == NULL)
544 goto onError;
545 unicode = PyCodec_Decode(buffer, encoding, errors);
546 if (unicode == NULL)
547 goto onError;
548 if (!PyUnicode_Check(unicode)) {
549 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000550 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000551 unicode->ob_type->tp_name);
552 Py_DECREF(unicode);
553 goto onError;
554 }
555 Py_DECREF(buffer);
556 return unicode;
557
558 onError:
559 Py_XDECREF(buffer);
560 return NULL;
561}
562
563PyObject *PyUnicode_Encode(const Py_UNICODE *s,
564 int size,
565 const char *encoding,
566 const char *errors)
567{
568 PyObject *v, *unicode;
569
570 unicode = PyUnicode_FromUnicode(s, size);
571 if (unicode == NULL)
572 return NULL;
573 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
574 Py_DECREF(unicode);
575 return v;
576}
577
578PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
579 const char *encoding,
580 const char *errors)
581{
582 PyObject *v;
583
584 if (!PyUnicode_Check(unicode)) {
585 PyErr_BadArgument();
586 goto onError;
587 }
Fred Drakee4315f52000-05-09 19:53:39 +0000588
589 if (encoding == NULL)
590 encoding = PyUnicode_GetDefaultEncoding();
591
592 /* Shortcuts for common default encodings */
593 if (errors == NULL) {
594 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000595 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000596 else if (strcmp(encoding, "latin-1") == 0)
597 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000598#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
599 else if (strcmp(encoding, "mbcs") == 0)
600 return PyUnicode_AsMBCSString(unicode);
601#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000602 else if (strcmp(encoding, "ascii") == 0)
603 return PyUnicode_AsASCIIString(unicode);
604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605
606 /* Encode via the codec registry */
607 v = PyCodec_Encode(unicode, encoding, errors);
608 if (v == NULL)
609 goto onError;
610 /* XXX Should we really enforce this ? */
611 if (!PyString_Check(v)) {
612 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000613 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 v->ob_type->tp_name);
615 Py_DECREF(v);
616 goto onError;
617 }
618 return v;
619
620 onError:
621 return NULL;
622}
623
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000624PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
625 const char *errors)
626{
627 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
628
629 if (v)
630 return v;
631 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
632 if (v && errors == NULL)
633 ((PyUnicodeObject *)unicode)->defenc = v;
634 return v;
635}
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
638{
639 if (!PyUnicode_Check(unicode)) {
640 PyErr_BadArgument();
641 goto onError;
642 }
643 return PyUnicode_AS_UNICODE(unicode);
644
645 onError:
646 return NULL;
647}
648
649int PyUnicode_GetSize(PyObject *unicode)
650{
651 if (!PyUnicode_Check(unicode)) {
652 PyErr_BadArgument();
653 goto onError;
654 }
655 return PyUnicode_GET_SIZE(unicode);
656
657 onError:
658 return -1;
659}
660
Thomas Wouters78890102000-07-22 19:25:51 +0000661const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000662{
663 return unicode_default_encoding;
664}
665
666int PyUnicode_SetDefaultEncoding(const char *encoding)
667{
668 PyObject *v;
669
670 /* Make sure the encoding is valid. As side effect, this also
671 loads the encoding into the codec registry cache. */
672 v = _PyCodec_Lookup(encoding);
673 if (v == NULL)
674 goto onError;
675 Py_DECREF(v);
676 strncpy(unicode_default_encoding,
677 encoding,
678 sizeof(unicode_default_encoding));
679 return 0;
680
681 onError:
682 return -1;
683}
684
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000685/* error handling callback helper:
686 build arguments, call the callback and check the arguments,
687 if no exception occured, copy the replacement to the output
688 and adjust various state variables.
689 return 0 on success, -1 on error
690*/
691
692static
693int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
694 const char *encoding, const char *reason,
695 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
696 PyObject **output, int *outpos, Py_UNICODE **outptr)
697{
698 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
699
700 PyObject *restuple = NULL;
701 PyObject *repunicode = NULL;
702 int outsize = PyUnicode_GET_SIZE(*output);
703 int requiredsize;
704 int newpos;
705 Py_UNICODE *repptr;
706 int repsize;
707 int res = -1;
708
709 if (*errorHandler == NULL) {
710 *errorHandler = PyCodec_LookupError(errors);
711 if (*errorHandler == NULL)
712 goto onError;
713 }
714
715 if (*exceptionObject == NULL) {
716 *exceptionObject = PyUnicodeDecodeError_Create(
717 encoding, input, insize, *startinpos, *endinpos, reason);
718 if (*exceptionObject == NULL)
719 goto onError;
720 }
721 else {
722 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
723 goto onError;
724 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
725 goto onError;
726 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
727 goto onError;
728 }
729
730 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
731 if (restuple == NULL)
732 goto onError;
733 if (!PyTuple_Check(restuple)) {
734 PyErr_Format(PyExc_TypeError, &argparse[4]);
735 goto onError;
736 }
737 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
738 goto onError;
739 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000740 newpos = insize+newpos;
741 if (newpos<0 || newpos>insize) {
742 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
743 goto onError;
744 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000745
746 /* need more space? (at least enough for what we
747 have+the replacement+the rest of the string (starting
748 at the new input position), so we won't have to check space
749 when there are no errors in the rest of the string) */
750 repptr = PyUnicode_AS_UNICODE(repunicode);
751 repsize = PyUnicode_GET_SIZE(repunicode);
752 requiredsize = *outpos + repsize + insize-newpos;
753 if (requiredsize > outsize) {
754 if (requiredsize<2*outsize)
755 requiredsize = 2*outsize;
756 if (PyUnicode_Resize(output, requiredsize))
757 goto onError;
758 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
759 }
760 *endinpos = newpos;
761 *inptr = input + newpos;
762 Py_UNICODE_COPY(*outptr, repptr, repsize);
763 *outptr += repsize;
764 *outpos += repsize;
765 /* we made it! */
766 res = 0;
767
768 onError:
769 Py_XDECREF(restuple);
770 return res;
771}
772
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000773/* --- UTF-7 Codec -------------------------------------------------------- */
774
775/* see RFC2152 for details */
776
/* Classification of the 128 ASCII characters for the UTF-7 codec.
   Each entry tells whether the character is special, i.e. cannot be
   directly encoded:

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional)

   The table is indexed by the character's ordinal. */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
795
/* SPECIAL(c, encodeO, encodeWS): true if character c may not be written
   directly: it is above 127 or marked 1 in utf7_special, or it is
   whitespace (class 2) and encodeWS is set, or it belongs to Set O
   (class 3) and encodeO is set.  Evaluates c more than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of n. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true if c is a character of the base64 alphabet. */
#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')
/* UB64(c): 6-bit value of base64 character c (inverse of B64); only
   meaningful for characters accepted by B64CHAR. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, write
   the topmost 6 pending bits as a base64 character to *out (advancing
   out) and reduce bits by 6. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, extract the topmost 16 pending bits as one character.  The
   character is stored to *out (advancing out) unless it lies in
   0xDC00..0xDFFF, in which case surrogate is set, errmsg is assigned
   and control jumps to utf7Error; the character following such an
   error is dropped.  The expansion site must provide the errmsg
   variable and the utf7Error label. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate
               so let's not bother seeing if the low surrogate is correct or not */\
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    } \
830
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000831PyObject *PyUnicode_DecodeUTF7(const char *s,
832 int size,
833 const char *errors)
834{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000835 const char *starts = s;
836 int startinpos;
837 int endinpos;
838 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000839 const char *e;
840 PyUnicodeObject *unicode;
841 Py_UNICODE *p;
842 const char *errmsg = "";
843 int inShift = 0;
844 unsigned int bitsleft = 0;
845 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000846 int surrogate = 0;
847 PyObject *errorHandler = NULL;
848 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000849
850 unicode = _PyUnicode_New(size);
851 if (!unicode)
852 return NULL;
853 if (size == 0)
854 return (PyObject *)unicode;
855
856 p = unicode->str;
857 e = s + size;
858
859 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 Py_UNICODE ch;
861 restart:
862 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000863
864 if (inShift) {
865 if ((ch == '-') || !B64CHAR(ch)) {
866 inShift = 0;
867 s++;
868
869 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
870 if (bitsleft >= 6) {
871 /* The shift sequence has a partial character in it. If
872 bitsleft < 6 then we could just classify it as padding
873 but that is not the case here */
874
875 errmsg = "partial character in shift sequence";
876 goto utf7Error;
877 }
878 /* According to RFC2152 the remaining bits should be zero. We
879 choose to signal an error/insert a replacement character
880 here so indicate the potential of a misencoded character. */
881
882 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
883 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
884 errmsg = "non-zero padding bits in shift sequence";
885 goto utf7Error;
886 }
887
888 if (ch == '-') {
889 if ((s < e) && (*(s) == '-')) {
890 *p++ = '-';
891 inShift = 1;
892 }
893 } else if (SPECIAL(ch,0,0)) {
894 errmsg = "unexpected special character";
895 goto utf7Error;
896 } else {
897 *p++ = ch;
898 }
899 } else {
900 charsleft = (charsleft << 6) | UB64(ch);
901 bitsleft += 6;
902 s++;
903 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
904 }
905 }
906 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000907 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908 s++;
909 if (s < e && *s == '-') {
910 s++;
911 *p++ = '+';
912 } else
913 {
914 inShift = 1;
915 bitsleft = 0;
916 }
917 }
918 else if (SPECIAL(ch,0,0)) {
919 errmsg = "unexpected special character";
920 s++;
921 goto utf7Error;
922 }
923 else {
924 *p++ = ch;
925 s++;
926 }
927 continue;
928 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000929 outpos = p-PyUnicode_AS_UNICODE(unicode);
930 endinpos = s-starts;
931 if (unicode_decode_call_errorhandler(
932 errors, &errorHandler,
933 "utf7", errmsg,
934 starts, size, &startinpos, &endinpos, &exc, &s,
935 (PyObject **)&unicode, &outpos, &p))
936 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000937 }
938
939 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000940 outpos = p-PyUnicode_AS_UNICODE(unicode);
941 endinpos = size;
942 if (unicode_decode_call_errorhandler(
943 errors, &errorHandler,
944 "utf7", "unterminated shift sequence",
945 starts, size, &startinpos, &endinpos, &exc, &s,
946 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000947 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000948 if (s < e)
949 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 }
951
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000952 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000953 goto onError;
954
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000955 Py_XDECREF(errorHandler);
956 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000957 return (PyObject *)unicode;
958
959onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000960 Py_XDECREF(errorHandler);
961 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000962 Py_DECREF(unicode);
963 return NULL;
964}
965
966
967PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
968 int size,
969 int encodeSetO,
970 int encodeWhiteSpace,
971 const char *errors)
972{
973 PyObject *v;
974 /* It might be possible to tighten this worst case */
975 unsigned int cbAllocated = 5 * size;
976 int inShift = 0;
977 int i = 0;
978 unsigned int bitsleft = 0;
979 unsigned long charsleft = 0;
980 char * out;
981 char * start;
982
983 if (size == 0)
984 return PyString_FromStringAndSize(NULL, 0);
985
986 v = PyString_FromStringAndSize(NULL, cbAllocated);
987 if (v == NULL)
988 return NULL;
989
990 start = out = PyString_AS_STRING(v);
991 for (;i < size; ++i) {
992 Py_UNICODE ch = s[i];
993
994 if (!inShift) {
995 if (ch == '+') {
996 *out++ = '+';
997 *out++ = '-';
998 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
999 charsleft = ch;
1000 bitsleft = 16;
1001 *out++ = '+';
1002 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1003 inShift = bitsleft > 0;
1004 } else {
1005 *out++ = (char) ch;
1006 }
1007 } else {
1008 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1009 *out++ = B64(charsleft << (6-bitsleft));
1010 charsleft = 0;
1011 bitsleft = 0;
1012 /* Characters not in the BASE64 set implicitly unshift the sequence
1013 so no '-' is required, except if the character is itself a '-' */
1014 if (B64CHAR(ch) || ch == '-') {
1015 *out++ = '-';
1016 }
1017 inShift = 0;
1018 *out++ = (char) ch;
1019 } else {
1020 bitsleft += 16;
1021 charsleft = (charsleft << 16) | ch;
1022 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1023
1024 /* If the next character is special then we dont' need to terminate
1025 the shift sequence. If the next character is not a BASE64 character
1026 or '-' then the shift sequence will be terminated implicitly and we
1027 don't have to insert a '-'. */
1028
1029 if (bitsleft == 0) {
1030 if (i + 1 < size) {
1031 Py_UNICODE ch2 = s[i+1];
1032
1033 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1034
1035 } else if (B64CHAR(ch2) || ch2 == '-') {
1036 *out++ = '-';
1037 inShift = 0;
1038 } else {
1039 inShift = 0;
1040 }
1041
1042 }
1043 else {
1044 *out++ = '-';
1045 inShift = 0;
1046 }
1047 }
1048 }
1049 }
1050 }
1051 if (bitsleft) {
1052 *out++= B64(charsleft << (6-bitsleft) );
1053 *out++ = '-';
1054 }
1055
Tim Peters5de98422002-04-27 18:44:32 +00001056 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001057 return v;
1058}
1059
/* The UTF-7 helper macros are only meant for the codec functions
   above; drop them again. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1066
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067/* --- UTF-8 Codec -------------------------------------------------------- */
1068
/* Map a UTF-8 lead byte to the total length of the sequence it
   starts.  Zero means the byte is not a legal prefix.  See RFC 2279 for
   details.  The table is read-only, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not a legal prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1090
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091PyObject *PyUnicode_DecodeUTF8(const char *s,
1092 int size,
1093 const char *errors)
1094{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001095 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001097 int startinpos;
1098 int endinpos;
1099 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 const char *e;
1101 PyUnicodeObject *unicode;
1102 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001103 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001104 PyObject *errorHandler = NULL;
1105 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001106
1107 /* Note: size will always be longer than the resulting Unicode
1108 character count */
1109 unicode = _PyUnicode_New(size);
1110 if (!unicode)
1111 return NULL;
1112 if (size == 0)
1113 return (PyObject *)unicode;
1114
1115 /* Unpack UTF-8 encoded data */
1116 p = unicode->str;
1117 e = s + size;
1118
1119 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001120 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121
1122 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001123 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001124 s++;
1125 continue;
1126 }
1127
1128 n = utf8_code_length[ch];
1129
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001130 if (s + n > e) {
1131 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001132 startinpos = s-starts;
1133 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001134 goto utf8Error;
1135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136
1137 switch (n) {
1138
1139 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001140 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001141 startinpos = s-starts;
1142 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001143 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144
1145 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001146 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001147 startinpos = s-starts;
1148 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001149 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150
1151 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001152 if ((s[1] & 0xc0) != 0x80) {
1153 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001154 startinpos = s-starts;
1155 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001156 goto utf8Error;
1157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001159 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001160 startinpos = s-starts;
1161 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001162 errmsg = "illegal encoding";
1163 goto utf8Error;
1164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001166 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 break;
1168
1169 case 3:
1170 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 (s[2] & 0xc0) != 0x80) {
1172 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001173 startinpos = s-starts;
1174 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001175 goto utf8Error;
1176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001178 if (ch < 0x0800) {
1179 /* Note: UTF-8 encodings of surrogates are considered
1180 legal UTF-8 sequences;
1181
1182 XXX For wide builds (UCS-4) we should probably try
1183 to recombine the surrogates into a single code
1184 unit.
1185 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001186 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001187 startinpos = s-starts;
1188 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001189 goto utf8Error;
1190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001192 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001193 break;
1194
1195 case 4:
1196 if ((s[1] & 0xc0) != 0x80 ||
1197 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001198 (s[3] & 0xc0) != 0x80) {
1199 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001200 startinpos = s-starts;
1201 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001202 goto utf8Error;
1203 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001204 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1205 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1206 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001207 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001208 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001209 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001210 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001211 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001212 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001213 startinpos = s-starts;
1214 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001215 goto utf8Error;
1216 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001217#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001218 *p++ = (Py_UNICODE)ch;
1219#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001220 /* compute and append the two surrogates: */
1221
1222 /* translate from 10000..10FFFF to 0..FFFF */
1223 ch -= 0x10000;
1224
1225 /* high surrogate = top 10 bits added to D800 */
1226 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1227
1228 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001229 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001230#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 break;
1232
1233 default:
1234 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001236 startinpos = s-starts;
1237 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001238 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 }
1240 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001241 continue;
1242
1243 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001244 outpos = p-PyUnicode_AS_UNICODE(unicode);
1245 if (unicode_decode_call_errorhandler(
1246 errors, &errorHandler,
1247 "utf8", errmsg,
1248 starts, size, &startinpos, &endinpos, &exc, &s,
1249 (PyObject **)&unicode, &outpos, &p))
1250 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 }
1252
1253 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001254 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 goto onError;
1256
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001257 Py_XDECREF(errorHandler);
1258 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 return (PyObject *)unicode;
1260
1261onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001262 Py_XDECREF(errorHandler);
1263 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 Py_DECREF(unicode);
1265 return NULL;
1266}
1267
Tim Peters602f7402002-04-27 18:03:26 +00001268/* Allocation strategy: if the string is short, convert into a stack buffer
1269 and allocate exactly as much space needed at the end. Else allocate the
1270 maximum possible needed (4 result bytes per Unicode character), and return
1271 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001272*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001273PyObject *
1274PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1275 int size,
1276 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277{
Tim Peters602f7402002-04-27 18:03:26 +00001278#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001279
Tim Peters602f7402002-04-27 18:03:26 +00001280 int i; /* index into s of next input byte */
1281 PyObject *v; /* result string object */
1282 char *p; /* next free byte in output buffer */
1283 int nallocated; /* number of result bytes allocated */
1284 int nneeded; /* number of result bytes needed */
1285 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001286
Tim Peters602f7402002-04-27 18:03:26 +00001287 assert(s != NULL);
1288 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
Tim Peters602f7402002-04-27 18:03:26 +00001290 if (size <= MAX_SHORT_UNICHARS) {
1291 /* Write into the stack buffer; nallocated can't overflow.
1292 * At the end, we'll allocate exactly as much heap space as it
1293 * turns out we need.
1294 */
1295 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1296 v = NULL; /* will allocate after we're done */
1297 p = stackbuf;
1298 }
1299 else {
1300 /* Overallocate on the heap, and give the excess back at the end. */
1301 nallocated = size * 4;
1302 if (nallocated / 4 != size) /* overflow! */
1303 return PyErr_NoMemory();
1304 v = PyString_FromStringAndSize(NULL, nallocated);
1305 if (v == NULL)
1306 return NULL;
1307 p = PyString_AS_STRING(v);
1308 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001309
Tim Peters602f7402002-04-27 18:03:26 +00001310 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001311 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001312
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001313 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001314 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001315 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001316
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001318 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001319 *p++ = (char)(0xc0 | (ch >> 6));
1320 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001321 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001322 else {
Tim Peters602f7402002-04-27 18:03:26 +00001323 /* Encode UCS2 Unicode ordinals */
1324 if (ch < 0x10000) {
1325 /* Special case: check for high surrogate */
1326 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1327 Py_UCS4 ch2 = s[i];
1328 /* Check for low surrogate and combine the two to
1329 form a UCS4 value */
1330 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001331 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001332 i++;
1333 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001334 }
Tim Peters602f7402002-04-27 18:03:26 +00001335 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001336 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001337 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001338 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1339 *p++ = (char)(0x80 | (ch & 0x3f));
1340 continue;
1341 }
1342encodeUCS4:
1343 /* Encode UCS4 Unicode ordinals */
1344 *p++ = (char)(0xf0 | (ch >> 18));
1345 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1346 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1347 *p++ = (char)(0x80 | (ch & 0x3f));
1348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001350
Tim Peters602f7402002-04-27 18:03:26 +00001351 if (v == NULL) {
1352 /* This was stack allocated. */
1353 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1354 assert(nneeded <= nallocated);
1355 v = PyString_FromStringAndSize(stackbuf, nneeded);
1356 }
1357 else {
1358 /* Cut back to size actually needed. */
1359 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1360 assert(nneeded <= nallocated);
1361 _PyString_Resize(&v, nneeded);
1362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001364
Tim Peters602f7402002-04-27 18:03:26 +00001365#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366}
1367
Guido van Rossumd57fd912000-03-10 22:53:23 +00001368PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1369{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 if (!PyUnicode_Check(unicode)) {
1371 PyErr_BadArgument();
1372 return NULL;
1373 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001374 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1375 PyUnicode_GET_SIZE(unicode),
1376 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377}
1378
1379/* --- UTF-16 Codec ------------------------------------------------------- */
1380
Tim Peters772747b2001-08-09 22:21:55 +00001381PyObject *
1382PyUnicode_DecodeUTF16(const char *s,
1383 int size,
1384 const char *errors,
1385 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001387 const char *starts = s;
1388 int startinpos;
1389 int endinpos;
1390 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391 PyUnicodeObject *unicode;
1392 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001393 const unsigned char *q, *e;
1394 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001395 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001396 /* Offsets from q for retrieving byte pairs in the right order. */
1397#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1398 int ihi = 1, ilo = 0;
1399#else
1400 int ihi = 0, ilo = 1;
1401#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001402 PyObject *errorHandler = NULL;
1403 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404
1405 /* Note: size will always be longer than the resulting Unicode
1406 character count */
1407 unicode = _PyUnicode_New(size);
1408 if (!unicode)
1409 return NULL;
1410 if (size == 0)
1411 return (PyObject *)unicode;
1412
1413 /* Unpack UTF-16 encoded data */
1414 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001415 q = (unsigned char *)s;
1416 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
1418 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001419 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001421 /* Check for BOM marks (U+FEFF) in the input and adjust current
1422 byte order setting accordingly. In native mode, the leading BOM
1423 mark is skipped, in all other modes, it is copied to the output
1424 stream as-is (giving a ZWNBSP character). */
1425 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001426 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001427#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001428 if (bom == 0xFEFF) {
1429 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001430 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001431 }
1432 else if (bom == 0xFFFE) {
1433 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001434 bo = 1;
1435 }
1436#else
Tim Peters772747b2001-08-09 22:21:55 +00001437 if (bom == 0xFEFF) {
1438 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001439 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001440 }
1441 else if (bom == 0xFFFE) {
1442 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001443 bo = -1;
1444 }
1445#endif
1446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447
Tim Peters772747b2001-08-09 22:21:55 +00001448 if (bo == -1) {
1449 /* force LE */
1450 ihi = 1;
1451 ilo = 0;
1452 }
1453 else if (bo == 1) {
1454 /* force BE */
1455 ihi = 0;
1456 ilo = 1;
1457 }
1458
1459 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001460 Py_UNICODE ch;
1461 /* remaing bytes at the end? (size should be even) */
1462 if (e-q<2) {
1463 errmsg = "truncated data";
1464 startinpos = ((const char *)q)-starts;
1465 endinpos = ((const char *)e)-starts;
1466 goto utf16Error;
1467 /* The remaining input chars are ignored if the callback
1468 chooses to skip the input */
1469 }
1470 ch = (q[ihi] << 8) | q[ilo];
1471
Tim Peters772747b2001-08-09 22:21:55 +00001472 q += 2;
1473
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 if (ch < 0xD800 || ch > 0xDFFF) {
1475 *p++ = ch;
1476 continue;
1477 }
1478
1479 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001480 if (q >= e) {
1481 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001482 startinpos = (((const char *)q)-2)-starts;
1483 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001484 goto utf16Error;
1485 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001486 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001487 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1488 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001489 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001490#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001491 *p++ = ch;
1492 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001493#else
1494 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001495#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001496 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001497 }
1498 else {
1499 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500 startinpos = (((const char *)q)-4)-starts;
1501 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001502 goto utf16Error;
1503 }
1504
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001506 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001507 startinpos = (((const char *)q)-2)-starts;
1508 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001509 /* Fall through to report the error */
1510
1511 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001512 outpos = p-PyUnicode_AS_UNICODE(unicode);
1513 if (unicode_decode_call_errorhandler(
1514 errors, &errorHandler,
1515 "utf16", errmsg,
1516 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1517 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001518 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 }
1520
1521 if (byteorder)
1522 *byteorder = bo;
1523
1524 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001525 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 goto onError;
1527
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001528 Py_XDECREF(errorHandler);
1529 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530 return (PyObject *)unicode;
1531
1532onError:
1533 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001534 Py_XDECREF(errorHandler);
1535 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001536 return NULL;
1537}
1538
Tim Peters772747b2001-08-09 22:21:55 +00001539PyObject *
1540PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1541 int size,
1542 const char *errors,
1543 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544{
1545 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001546 unsigned char *p;
1547 int i, pairs;
1548 /* Offsets from p for storing byte pairs in the right order. */
1549#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1550 int ihi = 1, ilo = 0;
1551#else
1552 int ihi = 0, ilo = 1;
1553#endif
1554
1555#define STORECHAR(CH) \
1556 do { \
1557 p[ihi] = ((CH) >> 8) & 0xff; \
1558 p[ilo] = (CH) & 0xff; \
1559 p += 2; \
1560 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001562 for (i = pairs = 0; i < size; i++)
1563 if (s[i] >= 0x10000)
1564 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001566 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001567 if (v == NULL)
1568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569
Tim Peters772747b2001-08-09 22:21:55 +00001570 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001572 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001573 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001574 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001575
1576 if (byteorder == -1) {
1577 /* force LE */
1578 ihi = 1;
1579 ilo = 0;
1580 }
1581 else if (byteorder == 1) {
1582 /* force BE */
1583 ihi = 0;
1584 ilo = 1;
1585 }
1586
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001587 while (size-- > 0) {
1588 Py_UNICODE ch = *s++;
1589 Py_UNICODE ch2 = 0;
1590 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001591 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1592 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 }
Tim Peters772747b2001-08-09 22:21:55 +00001594 STORECHAR(ch);
1595 if (ch2)
1596 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001598 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001599#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600}
1601
1602PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1603{
1604 if (!PyUnicode_Check(unicode)) {
1605 PyErr_BadArgument();
1606 return NULL;
1607 }
1608 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1609 PyUnicode_GET_SIZE(unicode),
1610 NULL,
1611 0);
1612}
1613
1614/* --- Unicode Escape Codec ----------------------------------------------- */
1615
Fredrik Lundh06d12682001-01-24 07:59:11 +00001616static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001617
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1619 int size,
1620 const char *errors)
1621{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001622 const char *starts = s;
1623 int startinpos;
1624 int endinpos;
1625 int outpos;
1626 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001627 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001630 char* message;
1631 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001632 PyObject *errorHandler = NULL;
1633 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001634
Guido van Rossumd57fd912000-03-10 22:53:23 +00001635 /* Escaped strings will always be longer than the resulting
1636 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001637 length after conversion to the true value.
1638 (but if the error callback returns a long replacement string
1639 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001640 v = _PyUnicode_New(size);
1641 if (v == NULL)
1642 goto onError;
1643 if (size == 0)
1644 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001648
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649 while (s < end) {
1650 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001651 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653
1654 /* Non-escape characters are interpreted as Unicode ordinals */
1655 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001656 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657 continue;
1658 }
1659
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001660 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 /* \ - Escapes */
1662 s++;
1663 switch (*s++) {
1664
1665 /* \x escapes */
1666 case '\n': break;
1667 case '\\': *p++ = '\\'; break;
1668 case '\'': *p++ = '\''; break;
1669 case '\"': *p++ = '\"'; break;
1670 case 'b': *p++ = '\b'; break;
1671 case 'f': *p++ = '\014'; break; /* FF */
1672 case 't': *p++ = '\t'; break;
1673 case 'n': *p++ = '\n'; break;
1674 case 'r': *p++ = '\r'; break;
1675 case 'v': *p++ = '\013'; break; /* VT */
1676 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1677
1678 /* \OOO (octal) escapes */
1679 case '0': case '1': case '2': case '3':
1680 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001681 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001683 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001685 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001687 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 break;
1689
Fredrik Lundhccc74732001-02-18 22:13:49 +00001690 /* hex escapes */
1691 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001693 digits = 2;
1694 message = "truncated \\xXX escape";
1695 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696
Fredrik Lundhccc74732001-02-18 22:13:49 +00001697 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001699 digits = 4;
1700 message = "truncated \\uXXXX escape";
1701 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702
Fredrik Lundhccc74732001-02-18 22:13:49 +00001703 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001704 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001705 digits = 8;
1706 message = "truncated \\UXXXXXXXX escape";
1707 hexescape:
1708 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001709 outpos = p-PyUnicode_AS_UNICODE(v);
1710 if (s+digits>end) {
1711 endinpos = size;
1712 if (unicode_decode_call_errorhandler(
1713 errors, &errorHandler,
1714 "unicodeescape", "end of string in escape sequence",
1715 starts, size, &startinpos, &endinpos, &exc, &s,
1716 (PyObject **)&v, &outpos, &p))
1717 goto onError;
1718 goto nextByte;
1719 }
1720 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001721 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001722 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001723 endinpos = (s+i+1)-starts;
1724 if (unicode_decode_call_errorhandler(
1725 errors, &errorHandler,
1726 "unicodeescape", message,
1727 starts, size, &startinpos, &endinpos, &exc, &s,
1728 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001729 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001730 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001731 }
1732 chr = (chr<<4) & ~0xF;
1733 if (c >= '0' && c <= '9')
1734 chr += c - '0';
1735 else if (c >= 'a' && c <= 'f')
1736 chr += 10 + c - 'a';
1737 else
1738 chr += 10 + c - 'A';
1739 }
1740 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001741 if (chr == 0xffffffff)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 /* _decoding_error will have already written into the
1743 target buffer. */
1744 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001745 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001746 /* when we get here, chr is a 32-bit unicode character */
1747 if (chr <= 0xffff)
1748 /* UCS-2 character */
1749 *p++ = (Py_UNICODE) chr;
1750 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001751 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001752 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001753#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001754 *p++ = chr;
1755#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001756 chr -= 0x10000L;
1757 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001758 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001759#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001760 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 endinpos = s-starts;
1762 outpos = p-PyUnicode_AS_UNICODE(v);
1763 if (unicode_decode_call_errorhandler(
1764 errors, &errorHandler,
1765 "unicodeescape", "illegal Unicode character",
1766 starts, size, &startinpos, &endinpos, &exc, &s,
1767 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001768 goto onError;
1769 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001770 break;
1771
1772 /* \N{name} */
1773 case 'N':
1774 message = "malformed \\N character escape";
1775 if (ucnhash_CAPI == NULL) {
1776 /* load the unicode data module */
1777 PyObject *m, *v;
1778 m = PyImport_ImportModule("unicodedata");
1779 if (m == NULL)
1780 goto ucnhashError;
1781 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1782 Py_DECREF(m);
1783 if (v == NULL)
1784 goto ucnhashError;
1785 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1786 Py_DECREF(v);
1787 if (ucnhash_CAPI == NULL)
1788 goto ucnhashError;
1789 }
1790 if (*s == '{') {
1791 const char *start = s+1;
1792 /* look for the closing brace */
1793 while (*s != '}' && s < end)
1794 s++;
1795 if (s > start && s < end && *s == '}') {
1796 /* found a name. look it up in the unicode database */
1797 message = "unknown Unicode character name";
1798 s++;
1799 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1800 goto store;
1801 }
1802 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 endinpos = s-starts;
1804 outpos = p-PyUnicode_AS_UNICODE(v);
1805 if (unicode_decode_call_errorhandler(
1806 errors, &errorHandler,
1807 "unicodeescape", message,
1808 starts, size, &startinpos, &endinpos, &exc, &s,
1809 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001810 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001811 break;
1812
1813 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001814 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 message = "\\ at end of string";
1816 s--;
1817 endinpos = s-starts;
1818 outpos = p-PyUnicode_AS_UNICODE(v);
1819 if (unicode_decode_call_errorhandler(
1820 errors, &errorHandler,
1821 "unicodeescape", message,
1822 starts, size, &startinpos, &endinpos, &exc, &s,
1823 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001824 goto onError;
1825 }
1826 else {
1827 *p++ = '\\';
1828 *p++ = (unsigned char)s[-1];
1829 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001830 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 nextByte:
1833 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1836 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001837 Py_XDECREF(errorHandler);
1838 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001840
Fredrik Lundhccc74732001-02-18 22:13:49 +00001841ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001842 PyErr_SetString(
1843 PyExc_UnicodeError,
1844 "\\N escapes not supported (can't load unicodedata module)"
1845 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001846 Py_XDECREF(errorHandler);
1847 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001848 return NULL;
1849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001852 Py_XDECREF(errorHandler);
1853 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 return NULL;
1855}
1856
1857/* Return a Unicode-Escape string version of the Unicode object.
1858
1859 If quotes is true, the string is enclosed in u"" or u'' quotes as
1860 appropriate.
1861
1862*/
1863
Barry Warsaw51ac5802000-03-20 16:36:48 +00001864static const Py_UNICODE *findchar(const Py_UNICODE *s,
1865 int size,
1866 Py_UNICODE ch);
1867
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868static
1869PyObject *unicodeescape_string(const Py_UNICODE *s,
1870 int size,
1871 int quotes)
1872{
1873 PyObject *repr;
1874 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001876 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001877
1878 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1879 if (repr == NULL)
1880 return NULL;
1881
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001882 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883
1884 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 *p++ = 'u';
1886 *p++ = (findchar(s, size, '\'') &&
1887 !findchar(s, size, '"')) ? '"' : '\'';
1888 }
1889 while (size-- > 0) {
1890 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001891
Guido van Rossumd57fd912000-03-10 22:53:23 +00001892 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001893 if (quotes &&
1894 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895 *p++ = '\\';
1896 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001897 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001899
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001900#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001901 /* Map 21-bit characters to '\U00xxxxxx' */
1902 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001903 int offset = p - PyString_AS_STRING(repr);
1904
1905 /* Resize the string if necessary */
1906 if (offset + 12 > PyString_GET_SIZE(repr)) {
1907 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001908 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001909 p = PyString_AS_STRING(repr) + offset;
1910 }
1911
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912 *p++ = '\\';
1913 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001914 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1915 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1916 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1917 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1918 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1919 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1920 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001921 *p++ = hexdigit[ch & 0x0000000F];
1922 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001923 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001924#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001925 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1926 else if (ch >= 0xD800 && ch < 0xDC00) {
1927 Py_UNICODE ch2;
1928 Py_UCS4 ucs;
1929
1930 ch2 = *s++;
1931 size--;
1932 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1933 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1934 *p++ = '\\';
1935 *p++ = 'U';
1936 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1937 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1938 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1939 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1940 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1941 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1942 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1943 *p++ = hexdigit[ucs & 0x0000000F];
1944 continue;
1945 }
1946 /* Fall through: isolated surrogates are copied as-is */
1947 s--;
1948 size++;
1949 }
1950
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001952 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953 *p++ = '\\';
1954 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001955 *p++ = hexdigit[(ch >> 12) & 0x000F];
1956 *p++ = hexdigit[(ch >> 8) & 0x000F];
1957 *p++ = hexdigit[(ch >> 4) & 0x000F];
1958 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001960
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001961 /* Map special whitespace to '\t', \n', '\r' */
1962 else if (ch == '\t') {
1963 *p++ = '\\';
1964 *p++ = 't';
1965 }
1966 else if (ch == '\n') {
1967 *p++ = '\\';
1968 *p++ = 'n';
1969 }
1970 else if (ch == '\r') {
1971 *p++ = '\\';
1972 *p++ = 'r';
1973 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001974
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001975 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001976 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001978 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001979 *p++ = hexdigit[(ch >> 4) & 0x000F];
1980 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001982
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983 /* Copy everything else as-is */
1984 else
1985 *p++ = (char) ch;
1986 }
1987 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001988 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989
1990 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001991 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return repr;
1993}
1994
1995PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1996 int size)
1997{
1998 return unicodeescape_string(s, size, 0);
1999}
2000
2001PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2002{
2003 if (!PyUnicode_Check(unicode)) {
2004 PyErr_BadArgument();
2005 return NULL;
2006 }
2007 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2008 PyUnicode_GET_SIZE(unicode));
2009}
2010
2011/* --- Raw Unicode Escape Codec ------------------------------------------- */
2012
2013PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2014 int size,
2015 const char *errors)
2016{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002017 const char *starts = s;
2018 int startinpos;
2019 int endinpos;
2020 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002022 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002023 const char *end;
2024 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002025 PyObject *errorHandler = NULL;
2026 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002027
2028 /* Escaped strings will always be longer than the resulting
2029 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002030 length after conversion to the true value. (But decoding error
2031 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032 v = _PyUnicode_New(size);
2033 if (v == NULL)
2034 goto onError;
2035 if (size == 0)
2036 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002037 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038 end = s + size;
2039 while (s < end) {
2040 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002041 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002043 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044
2045 /* Non-escape characters are interpreted as Unicode ordinals */
2046 if (*s != '\\') {
2047 *p++ = (unsigned char)*s++;
2048 continue;
2049 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002050 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051
2052 /* \u-escapes are only interpreted iff the number of leading
2053 backslashes if odd */
2054 bs = s;
2055 for (;s < end;) {
2056 if (*s != '\\')
2057 break;
2058 *p++ = (unsigned char)*s++;
2059 }
2060 if (((s - bs) & 1) == 0 ||
2061 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002062 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 continue;
2064 }
2065 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002066 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067 s++;
2068
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002069 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002071 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002074 endinpos = s-starts;
2075 if (unicode_decode_call_errorhandler(
2076 errors, &errorHandler,
2077 "rawunicodeescape", "truncated \\uXXXX",
2078 starts, size, &startinpos, &endinpos, &exc, &s,
2079 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 }
2083 x = (x<<4) & ~0xF;
2084 if (c >= '0' && c <= '9')
2085 x += c - '0';
2086 else if (c >= 'a' && c <= 'f')
2087 x += 10 + c - 'a';
2088 else
2089 x += 10 + c - 'A';
2090 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002091#ifndef Py_UNICODE_WIDE
2092 if (x > 0x10000) {
2093 if (unicode_decode_call_errorhandler(
2094 errors, &errorHandler,
2095 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2096 starts, size, &startinpos, &endinpos, &exc, &s,
2097 (PyObject **)&v, &outpos, &p))
2098 goto onError;
2099 }
2100#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002101 *p++ = x;
2102 nextByte:
2103 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002105 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002106 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107 Py_XDECREF(errorHandler);
2108 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109 return (PyObject *)v;
2110
2111 onError:
2112 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113 Py_XDECREF(errorHandler);
2114 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 return NULL;
2116}
2117
2118PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2119 int size)
2120{
2121 PyObject *repr;
2122 char *p;
2123 char *q;
2124
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002125 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002127#ifdef Py_UNICODE_WIDE
2128 repr = PyString_FromStringAndSize(NULL, 10 * size);
2129#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002131#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 if (repr == NULL)
2133 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002134 if (size == 0)
2135 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136
2137 p = q = PyString_AS_STRING(repr);
2138 while (size-- > 0) {
2139 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002140#ifdef Py_UNICODE_WIDE
2141 /* Map 32-bit characters to '\Uxxxxxxxx' */
2142 if (ch >= 0x10000) {
2143 *p++ = '\\';
2144 *p++ = 'U';
2145 *p++ = hexdigit[(ch >> 28) & 0xf];
2146 *p++ = hexdigit[(ch >> 24) & 0xf];
2147 *p++ = hexdigit[(ch >> 20) & 0xf];
2148 *p++ = hexdigit[(ch >> 16) & 0xf];
2149 *p++ = hexdigit[(ch >> 12) & 0xf];
2150 *p++ = hexdigit[(ch >> 8) & 0xf];
2151 *p++ = hexdigit[(ch >> 4) & 0xf];
2152 *p++ = hexdigit[ch & 15];
2153 }
2154 else
2155#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 /* Map 16-bit characters to '\uxxxx' */
2157 if (ch >= 256) {
2158 *p++ = '\\';
2159 *p++ = 'u';
2160 *p++ = hexdigit[(ch >> 12) & 0xf];
2161 *p++ = hexdigit[(ch >> 8) & 0xf];
2162 *p++ = hexdigit[(ch >> 4) & 0xf];
2163 *p++ = hexdigit[ch & 15];
2164 }
2165 /* Copy everything else as-is */
2166 else
2167 *p++ = (char) ch;
2168 }
2169 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002170 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171 return repr;
2172}
2173
2174PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2175{
2176 if (!PyUnicode_Check(unicode)) {
2177 PyErr_BadArgument();
2178 return NULL;
2179 }
2180 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2181 PyUnicode_GET_SIZE(unicode));
2182}
2183
2184/* --- Latin-1 Codec ------------------------------------------------------ */
2185
2186PyObject *PyUnicode_DecodeLatin1(const char *s,
2187 int size,
2188 const char *errors)
2189{
2190 PyUnicodeObject *v;
2191 Py_UNICODE *p;
2192
2193 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002194 if (size == 1 && *(unsigned char*)s < 256) {
2195 Py_UNICODE r = *(unsigned char*)s;
2196 return PyUnicode_FromUnicode(&r, 1);
2197 }
2198
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 v = _PyUnicode_New(size);
2200 if (v == NULL)
2201 goto onError;
2202 if (size == 0)
2203 return (PyObject *)v;
2204 p = PyUnicode_AS_UNICODE(v);
2205 while (size-- > 0)
2206 *p++ = (unsigned char)*s++;
2207 return (PyObject *)v;
2208
2209 onError:
2210 Py_XDECREF(v);
2211 return NULL;
2212}
2213
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002214/* create or adjust a UnicodeEncodeError */
2215static void make_encode_exception(PyObject **exceptionObject,
2216 const char *encoding,
2217 const Py_UNICODE *unicode, int size,
2218 int startpos, int endpos,
2219 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002221 if (*exceptionObject == NULL) {
2222 *exceptionObject = PyUnicodeEncodeError_Create(
2223 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 }
2225 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002226 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2227 goto onError;
2228 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2229 goto onError;
2230 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2231 goto onError;
2232 return;
2233 onError:
2234 Py_DECREF(*exceptionObject);
2235 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 }
2237}
2238
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002239/* raises a UnicodeEncodeError */
2240static void raise_encode_exception(PyObject **exceptionObject,
2241 const char *encoding,
2242 const Py_UNICODE *unicode, int size,
2243 int startpos, int endpos,
2244 const char *reason)
2245{
2246 make_encode_exception(exceptionObject,
2247 encoding, unicode, size, startpos, endpos, reason);
2248 if (*exceptionObject != NULL)
2249 PyCodec_StrictErrors(*exceptionObject);
2250}
2251
2252/* error handling callback helper:
2253 build arguments, call the callback and check the arguments,
2254 put the result into newpos and return the replacement string, which
2255 has to be freed by the caller */
2256static PyObject *unicode_encode_call_errorhandler(const char *errors,
2257 PyObject **errorHandler,
2258 const char *encoding, const char *reason,
2259 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2260 int startpos, int endpos,
2261 int *newpos)
2262{
2263 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2264
2265 PyObject *restuple;
2266 PyObject *resunicode;
2267
2268 if (*errorHandler == NULL) {
2269 *errorHandler = PyCodec_LookupError(errors);
2270 if (*errorHandler == NULL)
2271 return NULL;
2272 }
2273
2274 make_encode_exception(exceptionObject,
2275 encoding, unicode, size, startpos, endpos, reason);
2276 if (*exceptionObject == NULL)
2277 return NULL;
2278
2279 restuple = PyObject_CallFunctionObjArgs(
2280 *errorHandler, *exceptionObject, NULL);
2281 if (restuple == NULL)
2282 return NULL;
2283 if (!PyTuple_Check(restuple)) {
2284 PyErr_Format(PyExc_TypeError, &argparse[4]);
2285 Py_DECREF(restuple);
2286 return NULL;
2287 }
2288 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2289 &resunicode, newpos)) {
2290 Py_DECREF(restuple);
2291 return NULL;
2292 }
2293 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002294 *newpos = size+*newpos;
2295 if (*newpos<0 || *newpos>size) {
2296 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2297 Py_DECREF(restuple);
2298 return NULL;
2299 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002300 Py_INCREF(resunicode);
2301 Py_DECREF(restuple);
2302 return resunicode;
2303}
2304
2305static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2306 int size,
2307 const char *errors,
2308 int limit)
2309{
2310 /* output object */
2311 PyObject *res;
2312 /* pointers to the beginning and end+1 of input */
2313 const Py_UNICODE *startp = p;
2314 const Py_UNICODE *endp = p + size;
2315 /* pointer to the beginning of the unencodable characters */
2316 /* const Py_UNICODE *badp = NULL; */
2317 /* pointer into the output */
2318 char *str;
2319 /* current output position */
2320 int respos = 0;
2321 int ressize;
2322 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2323 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2324 PyObject *errorHandler = NULL;
2325 PyObject *exc = NULL;
2326 /* the following variable is used for caching string comparisons
2327 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2328 int known_errorHandler = -1;
2329
2330 /* allocate enough for a simple encoding without
2331 replacements, if we need more, we'll resize */
2332 res = PyString_FromStringAndSize(NULL, size);
2333 if (res == NULL)
2334 goto onError;
2335 if (size == 0)
2336 return res;
2337 str = PyString_AS_STRING(res);
2338 ressize = size;
2339
2340 while (p<endp) {
2341 Py_UNICODE c = *p;
2342
2343 /* can we encode this? */
2344 if (c<limit) {
2345 /* no overflow check, because we know that the space is enough */
2346 *str++ = (char)c;
2347 ++p;
2348 }
2349 else {
2350 int unicodepos = p-startp;
2351 int requiredsize;
2352 PyObject *repunicode;
2353 int repsize;
2354 int newpos;
2355 int respos;
2356 Py_UNICODE *uni2;
2357 /* startpos for collecting unencodable chars */
2358 const Py_UNICODE *collstart = p;
2359 const Py_UNICODE *collend = p;
2360 /* find all unecodable characters */
2361 while ((collend < endp) && ((*collend)>=limit))
2362 ++collend;
2363 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2364 if (known_errorHandler==-1) {
2365 if ((errors==NULL) || (!strcmp(errors, "strict")))
2366 known_errorHandler = 1;
2367 else if (!strcmp(errors, "replace"))
2368 known_errorHandler = 2;
2369 else if (!strcmp(errors, "ignore"))
2370 known_errorHandler = 3;
2371 else if (!strcmp(errors, "xmlcharrefreplace"))
2372 known_errorHandler = 4;
2373 else
2374 known_errorHandler = 0;
2375 }
2376 switch (known_errorHandler) {
2377 case 1: /* strict */
2378 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2379 goto onError;
2380 case 2: /* replace */
2381 while (collstart++<collend)
2382 *str++ = '?'; /* fall through */
2383 case 3: /* ignore */
2384 p = collend;
2385 break;
2386 case 4: /* xmlcharrefreplace */
2387 respos = str-PyString_AS_STRING(res);
2388 /* determine replacement size (temporarily (mis)uses p) */
2389 for (p = collstart, repsize = 0; p < collend; ++p) {
2390 if (*p<10)
2391 repsize += 2+1+1;
2392 else if (*p<100)
2393 repsize += 2+2+1;
2394 else if (*p<1000)
2395 repsize += 2+3+1;
2396 else if (*p<10000)
2397 repsize += 2+4+1;
2398 else if (*p<100000)
2399 repsize += 2+5+1;
2400 else if (*p<1000000)
2401 repsize += 2+6+1;
2402 else
2403 repsize += 2+7+1;
2404 }
2405 requiredsize = respos+repsize+(endp-collend);
2406 if (requiredsize > ressize) {
2407 if (requiredsize<2*ressize)
2408 requiredsize = 2*ressize;
2409 if (_PyString_Resize(&res, requiredsize))
2410 goto onError;
2411 str = PyString_AS_STRING(res) + respos;
2412 ressize = requiredsize;
2413 }
2414 /* generate replacement (temporarily (mis)uses p) */
2415 for (p = collstart; p < collend; ++p) {
2416 str += sprintf(str, "&#%d;", (int)*p);
2417 }
2418 p = collend;
2419 break;
2420 default:
2421 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2422 encoding, reason, startp, size, &exc,
2423 collstart-startp, collend-startp, &newpos);
2424 if (repunicode == NULL)
2425 goto onError;
2426 /* need more space? (at least enough for what we
2427 have+the replacement+the rest of the string, so
2428 we won't have to check space for encodable characters) */
2429 respos = str-PyString_AS_STRING(res);
2430 repsize = PyUnicode_GET_SIZE(repunicode);
2431 requiredsize = respos+repsize+(endp-collend);
2432 if (requiredsize > ressize) {
2433 if (requiredsize<2*ressize)
2434 requiredsize = 2*ressize;
2435 if (_PyString_Resize(&res, requiredsize)) {
2436 Py_DECREF(repunicode);
2437 goto onError;
2438 }
2439 str = PyString_AS_STRING(res) + respos;
2440 ressize = requiredsize;
2441 }
2442 /* check if there is anything unencodable in the replacement
2443 and copy it to the output */
2444 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2445 c = *uni2;
2446 if (c >= limit) {
2447 raise_encode_exception(&exc, encoding, startp, size,
2448 unicodepos, unicodepos+1, reason);
2449 Py_DECREF(repunicode);
2450 goto onError;
2451 }
2452 *str = (char)c;
2453 }
2454 p = startp + newpos;
2455 Py_DECREF(repunicode);
2456 }
2457 }
2458 }
2459 /* Resize if we allocated to much */
2460 respos = str-PyString_AS_STRING(res);
2461 if (respos<ressize)
2462 /* If this falls res will be NULL */
2463 _PyString_Resize(&res, respos);
2464 Py_XDECREF(errorHandler);
2465 Py_XDECREF(exc);
2466 return res;
2467
2468 onError:
2469 Py_XDECREF(res);
2470 Py_XDECREF(errorHandler);
2471 Py_XDECREF(exc);
2472 return NULL;
2473}
2474
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2476 int size,
2477 const char *errors)
2478{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480}
2481
2482PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2483{
2484 if (!PyUnicode_Check(unicode)) {
2485 PyErr_BadArgument();
2486 return NULL;
2487 }
2488 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2489 PyUnicode_GET_SIZE(unicode),
2490 NULL);
2491}
2492
2493/* --- 7-bit ASCII Codec -------------------------------------------------- */
2494
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495PyObject *PyUnicode_DecodeASCII(const char *s,
2496 int size,
2497 const char *errors)
2498{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002499 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 PyUnicodeObject *v;
2501 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002502 int startinpos;
2503 int endinpos;
2504 int outpos;
2505 const char *e;
2506 PyObject *errorHandler = NULL;
2507 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508
2509 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002510 if (size == 1 && *(unsigned char*)s < 128) {
2511 Py_UNICODE r = *(unsigned char*)s;
2512 return PyUnicode_FromUnicode(&r, 1);
2513 }
2514
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 v = _PyUnicode_New(size);
2516 if (v == NULL)
2517 goto onError;
2518 if (size == 0)
2519 return (PyObject *)v;
2520 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002521 e = s + size;
2522 while (s < e) {
2523 register unsigned char c = (unsigned char)*s;
2524 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002526 ++s;
2527 }
2528 else {
2529 startinpos = s-starts;
2530 endinpos = startinpos + 1;
2531 outpos = p-PyUnicode_AS_UNICODE(v);
2532 if (unicode_decode_call_errorhandler(
2533 errors, &errorHandler,
2534 "ascii", "ordinal not in range(128)",
2535 starts, size, &startinpos, &endinpos, &exc, &s,
2536 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002540 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002541 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002542 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002543 Py_XDECREF(errorHandler);
2544 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 return (PyObject *)v;
2546
2547 onError:
2548 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 Py_XDECREF(errorHandler);
2550 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 return NULL;
2552}
2553
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2555 int size,
2556 const char *errors)
2557{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002558 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559}
2560
2561PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2562{
2563 if (!PyUnicode_Check(unicode)) {
2564 PyErr_BadArgument();
2565 return NULL;
2566 }
2567 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2568 PyUnicode_GET_SIZE(unicode),
2569 NULL);
2570}
2571
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002572#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002573
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002574/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002575
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002576PyObject *PyUnicode_DecodeMBCS(const char *s,
2577 int size,
2578 const char *errors)
2579{
2580 PyUnicodeObject *v;
2581 Py_UNICODE *p;
2582
2583 /* First get the size of the result */
2584 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002585 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002586 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2587
2588 v = _PyUnicode_New(usize);
2589 if (v == NULL)
2590 return NULL;
2591 if (usize == 0)
2592 return (PyObject *)v;
2593 p = PyUnicode_AS_UNICODE(v);
2594 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2595 Py_DECREF(v);
2596 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2597 }
2598
2599 return (PyObject *)v;
2600}
2601
2602PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2603 int size,
2604 const char *errors)
2605{
2606 PyObject *repr;
2607 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002608 DWORD mbcssize;
2609
2610 /* If there are no characters, bail now! */
2611 if (size==0)
2612 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002613
2614 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002615 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002616 if (mbcssize==0)
2617 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2618
2619 repr = PyString_FromStringAndSize(NULL, mbcssize);
2620 if (repr == NULL)
2621 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002622 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002623 return repr;
2624
2625 /* Do the conversion */
2626 s = PyString_AS_STRING(repr);
2627 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2628 Py_DECREF(repr);
2629 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2630 }
2631 return repr;
2632}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002633
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002634PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2635{
2636 if (!PyUnicode_Check(unicode)) {
2637 PyErr_BadArgument();
2638 return NULL;
2639 }
2640 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2641 PyUnicode_GET_SIZE(unicode),
2642 NULL);
2643}
2644
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002645#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002646
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647/* --- Character Mapping Codec -------------------------------------------- */
2648
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649PyObject *PyUnicode_DecodeCharmap(const char *s,
2650 int size,
2651 PyObject *mapping,
2652 const char *errors)
2653{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002654 const char *starts = s;
2655 int startinpos;
2656 int endinpos;
2657 int outpos;
2658 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 PyUnicodeObject *v;
2660 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002661 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002662 PyObject *errorHandler = NULL;
2663 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664
2665 /* Default to Latin-1 */
2666 if (mapping == NULL)
2667 return PyUnicode_DecodeLatin1(s, size, errors);
2668
2669 v = _PyUnicode_New(size);
2670 if (v == NULL)
2671 goto onError;
2672 if (size == 0)
2673 return (PyObject *)v;
2674 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002675 e = s + size;
2676 while (s < e) {
2677 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 PyObject *w, *x;
2679
2680 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2681 w = PyInt_FromLong((long)ch);
2682 if (w == NULL)
2683 goto onError;
2684 x = PyObject_GetItem(mapping, w);
2685 Py_DECREF(w);
2686 if (x == NULL) {
2687 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002688 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002690 x = Py_None;
2691 Py_INCREF(x);
2692 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002693 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 }
2695
2696 /* Apply mapping */
2697 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002698 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 if (value < 0 || value > 65535) {
2700 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002701 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 Py_DECREF(x);
2703 goto onError;
2704 }
2705 *p++ = (Py_UNICODE)value;
2706 }
2707 else if (x == Py_None) {
2708 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002709 outpos = p-PyUnicode_AS_UNICODE(v);
2710 startinpos = s-starts;
2711 endinpos = startinpos+1;
2712 if (unicode_decode_call_errorhandler(
2713 errors, &errorHandler,
2714 "charmap", "character maps to <undefined>",
2715 starts, size, &startinpos, &endinpos, &exc, &s,
2716 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717 Py_DECREF(x);
2718 goto onError;
2719 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002720 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 }
2722 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002723 int targetsize = PyUnicode_GET_SIZE(x);
2724
2725 if (targetsize == 1)
2726 /* 1-1 mapping */
2727 *p++ = *PyUnicode_AS_UNICODE(x);
2728
2729 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002731 if (targetsize > extrachars) {
2732 /* resize first */
2733 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2734 int needed = (targetsize - extrachars) + \
2735 (targetsize << 2);
2736 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002737 if (_PyUnicode_Resize(&v,
2738 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002739 Py_DECREF(x);
2740 goto onError;
2741 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002742 p = PyUnicode_AS_UNICODE(v) + oldpos;
2743 }
2744 Py_UNICODE_COPY(p,
2745 PyUnicode_AS_UNICODE(x),
2746 targetsize);
2747 p += targetsize;
2748 extrachars -= targetsize;
2749 }
2750 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 }
2752 else {
2753 /* wrong return value */
2754 PyErr_SetString(PyExc_TypeError,
2755 "character mapping must return integer, None or unicode");
2756 Py_DECREF(x);
2757 goto onError;
2758 }
2759 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 }
2762 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002763 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002765 Py_XDECREF(errorHandler);
2766 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 return (PyObject *)v;
2768
2769 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 Py_XDECREF(errorHandler);
2771 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 Py_XDECREF(v);
2773 return NULL;
2774}
2775
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002776/* Lookup the character ch in the mapping. If the character
2777 can't be found, Py_None is returned (or NULL, if another
2778 error occured). */
2779static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002781 PyObject *w = PyInt_FromLong((long)c);
2782 PyObject *x;
2783
2784 if (w == NULL)
2785 return NULL;
2786 x = PyObject_GetItem(mapping, w);
2787 Py_DECREF(w);
2788 if (x == NULL) {
2789 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2790 /* No mapping found means: mapping is undefined. */
2791 PyErr_Clear();
2792 x = Py_None;
2793 Py_INCREF(x);
2794 return x;
2795 } else
2796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002798 else if (x == Py_None)
2799 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 else if (PyInt_Check(x)) {
2801 long value = PyInt_AS_LONG(x);
2802 if (value < 0 || value > 255) {
2803 PyErr_SetString(PyExc_TypeError,
2804 "character mapping must be in range(256)");
2805 Py_DECREF(x);
2806 return NULL;
2807 }
2808 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810 else if (PyString_Check(x))
2811 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002813 /* wrong return value */
2814 PyErr_SetString(PyExc_TypeError,
2815 "character mapping must return integer, None or str");
2816 Py_DECREF(x);
2817 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 }
2819}
2820
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821/* lookup the character, put the result in the output string and adjust
2822 various state variables. Reallocate the output string if not enough
2823 space is available. Return a new reference to the object that
2824 was put in the output buffer, or Py_None, if the mapping was undefined
2825 (in which case no character was written) or NULL, if a
2826 reallocation error ocurred. The called must decref the result */
2827static
2828PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2829 PyObject **outobj, int *outpos)
2830{
2831 PyObject *rep = charmapencode_lookup(c, mapping);
2832
2833 if (rep==NULL)
2834 return NULL;
2835 else if (rep==Py_None)
2836 return rep;
2837 else {
2838 char *outstart = PyString_AS_STRING(*outobj);
2839 int outsize = PyString_GET_SIZE(*outobj);
2840 if (PyInt_Check(rep)) {
2841 int requiredsize = *outpos+1;
2842 if (outsize<requiredsize) {
2843 /* exponentially overallocate to minimize reallocations */
2844 if (requiredsize < 2*outsize)
2845 requiredsize = 2*outsize;
2846 if (_PyString_Resize(outobj, requiredsize)) {
2847 Py_DECREF(rep);
2848 return NULL;
2849 }
2850 outstart = PyString_AS_STRING(*outobj);
2851 }
2852 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2853 }
2854 else {
2855 const char *repchars = PyString_AS_STRING(rep);
2856 int repsize = PyString_GET_SIZE(rep);
2857 int requiredsize = *outpos+repsize;
2858 if (outsize<requiredsize) {
2859 /* exponentially overallocate to minimize reallocations */
2860 if (requiredsize < 2*outsize)
2861 requiredsize = 2*outsize;
2862 if (_PyString_Resize(outobj, requiredsize)) {
2863 Py_DECREF(rep);
2864 return NULL;
2865 }
2866 outstart = PyString_AS_STRING(*outobj);
2867 }
2868 memcpy(outstart + *outpos, repchars, repsize);
2869 *outpos += repsize;
2870 }
2871 }
2872 return rep;
2873}
2874
2875/* handle an error in PyUnicode_EncodeCharmap
2876 Return 0 on success, -1 on error */
2877static
2878int charmap_encoding_error(
2879 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2880 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002881 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002882 PyObject **res, int *respos)
2883{
2884 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2885 int repsize;
2886 int newpos;
2887 Py_UNICODE *uni2;
2888 /* startpos for collecting unencodable chars */
2889 int collstartpos = *inpos;
2890 int collendpos = *inpos+1;
2891 int collpos;
2892 char *encoding = "charmap";
2893 char *reason = "character maps to <undefined>";
2894
2895 PyObject *x;
2896 /* find all unencodable characters */
2897 while (collendpos < size) {
2898 x = charmapencode_lookup(p[collendpos], mapping);
2899 if (x==NULL)
2900 return -1;
2901 else if (x!=Py_None) {
2902 Py_DECREF(x);
2903 break;
2904 }
2905 Py_DECREF(x);
2906 ++collendpos;
2907 }
2908 /* cache callback name lookup
2909 * (if not done yet, i.e. it's the first error) */
2910 if (*known_errorHandler==-1) {
2911 if ((errors==NULL) || (!strcmp(errors, "strict")))
2912 *known_errorHandler = 1;
2913 else if (!strcmp(errors, "replace"))
2914 *known_errorHandler = 2;
2915 else if (!strcmp(errors, "ignore"))
2916 *known_errorHandler = 3;
2917 else if (!strcmp(errors, "xmlcharrefreplace"))
2918 *known_errorHandler = 4;
2919 else
2920 *known_errorHandler = 0;
2921 }
2922 switch (*known_errorHandler) {
2923 case 1: /* strict */
2924 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2925 return -1;
2926 case 2: /* replace */
2927 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2928 x = charmapencode_output('?', mapping, res, respos);
2929 if (x==NULL) {
2930 return -1;
2931 }
2932 else if (x==Py_None) {
2933 Py_DECREF(x);
2934 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2935 return -1;
2936 }
2937 Py_DECREF(x);
2938 }
2939 /* fall through */
2940 case 3: /* ignore */
2941 *inpos = collendpos;
2942 break;
2943 case 4: /* xmlcharrefreplace */
2944 /* generate replacement (temporarily (mis)uses p) */
2945 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2946 char buffer[2+29+1+1];
2947 char *cp;
2948 sprintf(buffer, "&#%d;", (int)p[collpos]);
2949 for (cp = buffer; *cp; ++cp) {
2950 x = charmapencode_output(*cp, mapping, res, respos);
2951 if (x==NULL)
2952 return -1;
2953 else if (x==Py_None) {
2954 Py_DECREF(x);
2955 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2956 return -1;
2957 }
2958 Py_DECREF(x);
2959 }
2960 }
2961 *inpos = collendpos;
2962 break;
2963 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002964 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002965 encoding, reason, p, size, exceptionObject,
2966 collstartpos, collendpos, &newpos);
2967 if (repunicode == NULL)
2968 return -1;
2969 /* generate replacement */
2970 repsize = PyUnicode_GET_SIZE(repunicode);
2971 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2972 x = charmapencode_output(*uni2, mapping, res, respos);
2973 if (x==NULL) {
2974 Py_DECREF(repunicode);
2975 return -1;
2976 }
2977 else if (x==Py_None) {
2978 Py_DECREF(repunicode);
2979 Py_DECREF(x);
2980 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2981 return -1;
2982 }
2983 Py_DECREF(x);
2984 }
2985 *inpos = newpos;
2986 Py_DECREF(repunicode);
2987 }
2988 return 0;
2989}
2990
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2992 int size,
2993 PyObject *mapping,
2994 const char *errors)
2995{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996 /* output object */
2997 PyObject *res = NULL;
2998 /* current input position */
2999 int inpos = 0;
3000 /* current output position */
3001 int respos = 0;
3002 PyObject *errorHandler = NULL;
3003 PyObject *exc = NULL;
3004 /* the following variable is used for caching string comparisons
3005 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3006 * 3=ignore, 4=xmlcharrefreplace */
3007 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008
3009 /* Default to Latin-1 */
3010 if (mapping == NULL)
3011 return PyUnicode_EncodeLatin1(p, size, errors);
3012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003013 /* allocate enough for a simple encoding without
3014 replacements, if we need more, we'll resize */
3015 res = PyString_FromStringAndSize(NULL, size);
3016 if (res == NULL)
3017 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003018 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003019 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003021 while (inpos<size) {
3022 /* try to encode it */
3023 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3024 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003026 if (x==Py_None) { /* unencodable character */
3027 if (charmap_encoding_error(p, size, &inpos, mapping,
3028 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003029 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003030 &res, &respos)) {
3031 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003032 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003035 else
3036 /* done with this character => adjust input position */
3037 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 Py_DECREF(x);
3039 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041 /* Resize if we allocated to much */
3042 if (respos<PyString_GET_SIZE(res)) {
3043 if (_PyString_Resize(&res, respos))
3044 goto onError;
3045 }
3046 Py_XDECREF(exc);
3047 Py_XDECREF(errorHandler);
3048 return res;
3049
3050 onError:
3051 Py_XDECREF(res);
3052 Py_XDECREF(exc);
3053 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 return NULL;
3055}
3056
3057PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3058 PyObject *mapping)
3059{
3060 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3061 PyErr_BadArgument();
3062 return NULL;
3063 }
3064 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3065 PyUnicode_GET_SIZE(unicode),
3066 mapping,
3067 NULL);
3068}
3069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070/* create or adjust a UnicodeTranslateError */
3071static void make_translate_exception(PyObject **exceptionObject,
3072 const Py_UNICODE *unicode, int size,
3073 int startpos, int endpos,
3074 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 if (*exceptionObject == NULL) {
3077 *exceptionObject = PyUnicodeTranslateError_Create(
3078 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 }
3080 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003081 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3082 goto onError;
3083 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3084 goto onError;
3085 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3086 goto onError;
3087 return;
3088 onError:
3089 Py_DECREF(*exceptionObject);
3090 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
3092}
3093
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094/* raises a UnicodeTranslateError */
3095static void raise_translate_exception(PyObject **exceptionObject,
3096 const Py_UNICODE *unicode, int size,
3097 int startpos, int endpos,
3098 const char *reason)
3099{
3100 make_translate_exception(exceptionObject,
3101 unicode, size, startpos, endpos, reason);
3102 if (*exceptionObject != NULL)
3103 PyCodec_StrictErrors(*exceptionObject);
3104}
3105
3106/* error handling callback helper:
3107 build arguments, call the callback and check the arguments,
3108 put the result into newpos and return the replacement string, which
3109 has to be freed by the caller */
3110static PyObject *unicode_translate_call_errorhandler(const char *errors,
3111 PyObject **errorHandler,
3112 const char *reason,
3113 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3114 int startpos, int endpos,
3115 int *newpos)
3116{
3117 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3118
3119 PyObject *restuple;
3120 PyObject *resunicode;
3121
3122 if (*errorHandler == NULL) {
3123 *errorHandler = PyCodec_LookupError(errors);
3124 if (*errorHandler == NULL)
3125 return NULL;
3126 }
3127
3128 make_translate_exception(exceptionObject,
3129 unicode, size, startpos, endpos, reason);
3130 if (*exceptionObject == NULL)
3131 return NULL;
3132
3133 restuple = PyObject_CallFunctionObjArgs(
3134 *errorHandler, *exceptionObject, NULL);
3135 if (restuple == NULL)
3136 return NULL;
3137 if (!PyTuple_Check(restuple)) {
3138 PyErr_Format(PyExc_TypeError, &argparse[4]);
3139 Py_DECREF(restuple);
3140 return NULL;
3141 }
3142 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3143 &resunicode, newpos)) {
3144 Py_DECREF(restuple);
3145 return NULL;
3146 }
3147 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003148 *newpos = size+*newpos;
3149 if (*newpos<0 || *newpos>size) {
3150 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3151 Py_DECREF(restuple);
3152 return NULL;
3153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003154 Py_INCREF(resunicode);
3155 Py_DECREF(restuple);
3156 return resunicode;
3157}
3158
3159/* Lookup the character ch in the mapping and put the result in result,
3160 which must be decrefed by the caller.
3161 Return 0 on success, -1 on error */
3162static
3163int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3164{
3165 PyObject *w = PyInt_FromLong((long)c);
3166 PyObject *x;
3167
3168 if (w == NULL)
3169 return -1;
3170 x = PyObject_GetItem(mapping, w);
3171 Py_DECREF(w);
3172 if (x == NULL) {
3173 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3174 /* No mapping found means: use 1:1 mapping. */
3175 PyErr_Clear();
3176 *result = NULL;
3177 return 0;
3178 } else
3179 return -1;
3180 }
3181 else if (x == Py_None) {
3182 *result = x;
3183 return 0;
3184 }
3185 else if (PyInt_Check(x)) {
3186 long value = PyInt_AS_LONG(x);
3187 long max = PyUnicode_GetMax();
3188 if (value < 0 || value > max) {
3189 PyErr_Format(PyExc_TypeError,
3190 "character mapping must be in range(0x%lx)", max+1);
3191 Py_DECREF(x);
3192 return -1;
3193 }
3194 *result = x;
3195 return 0;
3196 }
3197 else if (PyUnicode_Check(x)) {
3198 *result = x;
3199 return 0;
3200 }
3201 else {
3202 /* wrong return value */
3203 PyErr_SetString(PyExc_TypeError,
3204 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003205 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003206 return -1;
3207 }
3208}
3209/* ensure that *outobj is at least requiredsize characters long,
3210if not reallocate and adjust various state variables.
3211Return 0 on success, -1 on error */
3212static
3213int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3214 int requiredsize)
3215{
3216 if (requiredsize > *outsize) {
3217 /* remember old output position */
3218 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3219 /* exponentially overallocate to minimize reallocations */
3220 if (requiredsize < 2 * *outsize)
3221 requiredsize = 2 * *outsize;
3222 if (_PyUnicode_Resize(outobj, requiredsize))
3223 return -1;
3224 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3225 *outsize = requiredsize;
3226 }
3227 return 0;
3228}
3229/* lookup the character, put the result in the output string and adjust
3230 various state variables. Return a new reference to the object that
3231 was put in the output buffer in *result, or Py_None, if the mapping was
3232 undefined (in which case no character was written).
3233 The called must decref result.
3234 Return 0 on success, -1 on error. */
3235static
3236int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3237 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3238{
3239 if (charmaptranslate_lookup(c, mapping, res))
3240 return -1;
3241 if (*res==NULL) {
3242 /* not found => default to 1:1 mapping */
3243 *(*outp)++ = (Py_UNICODE)c;
3244 }
3245 else if (*res==Py_None)
3246 ;
3247 else if (PyInt_Check(*res)) {
3248 /* no overflow check, because we know that the space is enough */
3249 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3250 }
3251 else if (PyUnicode_Check(*res)) {
3252 int repsize = PyUnicode_GET_SIZE(*res);
3253 if (repsize==1) {
3254 /* no overflow check, because we know that the space is enough */
3255 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3256 }
3257 else if (repsize!=0) {
3258 /* more than one character */
3259 int requiredsize = *outsize + repsize - 1;
3260 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3261 return -1;
3262 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3263 *outp += repsize;
3264 }
3265 }
3266 else
3267 return -1;
3268 return 0;
3269}
3270
3271PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 int size,
3273 PyObject *mapping,
3274 const char *errors)
3275{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 /* output object */
3277 PyObject *res = NULL;
3278 /* pointers to the beginning and end+1 of input */
3279 const Py_UNICODE *startp = p;
3280 const Py_UNICODE *endp = p + size;
3281 /* pointer into the output */
3282 Py_UNICODE *str;
3283 /* current output position */
3284 int respos = 0;
3285 int ressize;
3286 char *reason = "character maps to <undefined>";
3287 PyObject *errorHandler = NULL;
3288 PyObject *exc = NULL;
3289 /* the following variable is used for caching string comparisons
3290 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3291 * 3=ignore, 4=xmlcharrefreplace */
3292 int known_errorHandler = -1;
3293
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294 if (mapping == NULL) {
3295 PyErr_BadArgument();
3296 return NULL;
3297 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298
3299 /* allocate enough for a simple 1:1 translation without
3300 replacements, if we need more, we'll resize */
3301 res = PyUnicode_FromUnicode(NULL, size);
3302 if (res == NULL)
3303 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003305 return res;
3306 str = PyUnicode_AS_UNICODE(res);
3307 ressize = size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003309 while (p<endp) {
3310 /* try to encode it */
3311 PyObject *x = NULL;
3312 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3313 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 goto onError;
3315 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003316 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317 if (x!=Py_None) /* it worked => adjust input pointer */
3318 ++p;
3319 else { /* untranslatable character */
3320 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3321 int repsize;
3322 int newpos;
3323 Py_UNICODE *uni2;
3324 /* startpos for collecting untranslatable chars */
3325 const Py_UNICODE *collstart = p;
3326 const Py_UNICODE *collend = p+1;
3327 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003329 /* find all untranslatable characters */
3330 while (collend < endp) {
3331 if (charmaptranslate_lookup(*collend, mapping, &x))
3332 goto onError;
3333 Py_XDECREF(x);
3334 if (x!=Py_None)
3335 break;
3336 ++collend;
3337 }
3338 /* cache callback name lookup
3339 * (if not done yet, i.e. it's the first error) */
3340 if (known_errorHandler==-1) {
3341 if ((errors==NULL) || (!strcmp(errors, "strict")))
3342 known_errorHandler = 1;
3343 else if (!strcmp(errors, "replace"))
3344 known_errorHandler = 2;
3345 else if (!strcmp(errors, "ignore"))
3346 known_errorHandler = 3;
3347 else if (!strcmp(errors, "xmlcharrefreplace"))
3348 known_errorHandler = 4;
3349 else
3350 known_errorHandler = 0;
3351 }
3352 switch (known_errorHandler) {
3353 case 1: /* strict */
3354 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3355 goto onError;
3356 case 2: /* replace */
3357 /* No need to check for space, this is a 1:1 replacement */
3358 for (coll = collstart; coll<collend; ++coll)
3359 *str++ = '?';
3360 /* fall through */
3361 case 3: /* ignore */
3362 p = collend;
3363 break;
3364 case 4: /* xmlcharrefreplace */
3365 /* generate replacement (temporarily (mis)uses p) */
3366 for (p = collstart; p < collend; ++p) {
3367 char buffer[2+29+1+1];
3368 char *cp;
3369 sprintf(buffer, "&#%d;", (int)*p);
3370 if (charmaptranslate_makespace(&res, &str, &ressize,
3371 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3372 goto onError;
3373 for (cp = buffer; *cp; ++cp)
3374 *str++ = *cp;
3375 }
3376 p = collend;
3377 break;
3378 default:
3379 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3380 reason, startp, size, &exc,
3381 collstart-startp, collend-startp, &newpos);
3382 if (repunicode == NULL)
3383 goto onError;
3384 /* generate replacement */
3385 repsize = PyUnicode_GET_SIZE(repunicode);
3386 if (charmaptranslate_makespace(&res, &str, &ressize,
3387 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3388 Py_DECREF(repunicode);
3389 goto onError;
3390 }
3391 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3392 *str++ = *uni2;
3393 p = startp + newpos;
3394 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003395 }
3396 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 /* Resize if we allocated too much */
3399 respos = str-PyUnicode_AS_UNICODE(res);
3400 if (respos<ressize) {
3401 if (_PyUnicode_Resize(&res, respos))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003402 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 }
3404 Py_XDECREF(exc);
3405 Py_XDECREF(errorHandler);
3406 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003408 onError:
3409 Py_XDECREF(res);
3410 Py_XDECREF(exc);
3411 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412 return NULL;
3413}
3414
3415PyObject *PyUnicode_Translate(PyObject *str,
3416 PyObject *mapping,
3417 const char *errors)
3418{
3419 PyObject *result;
3420
3421 str = PyUnicode_FromObject(str);
3422 if (str == NULL)
3423 goto onError;
3424 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3425 PyUnicode_GET_SIZE(str),
3426 mapping,
3427 errors);
3428 Py_DECREF(str);
3429 return result;
3430
3431 onError:
3432 Py_XDECREF(str);
3433 return NULL;
3434}
3435
Guido van Rossum9e896b32000-04-05 20:11:21 +00003436/* --- Decimal Encoder ---------------------------------------------------- */
3437
3438int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3439 int length,
3440 char *output,
3441 const char *errors)
3442{
3443 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003444 PyObject *errorHandler = NULL;
3445 PyObject *exc = NULL;
3446 const char *encoding = "decimal";
3447 const char *reason = "invalid decimal Unicode string";
3448 /* the following variable is used for caching string comparisons
3449 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3450 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003451
3452 if (output == NULL) {
3453 PyErr_BadArgument();
3454 return -1;
3455 }
3456
3457 p = s;
3458 end = s + length;
3459 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003461 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462 PyObject *repunicode;
3463 int repsize;
3464 int newpos;
3465 Py_UNICODE *uni2;
3466 Py_UNICODE *collstart;
3467 Py_UNICODE *collend;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003468
3469 if (Py_UNICODE_ISSPACE(ch)) {
3470 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003472 continue;
3473 }
3474 decimal = Py_UNICODE_TODECIMAL(ch);
3475 if (decimal >= 0) {
3476 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003477 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003478 continue;
3479 }
Guido van Rossumba477042000-04-06 18:18:10 +00003480 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003481 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003483 continue;
3484 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003485 /* All other characters are considered unencodable */
3486 collstart = p;
3487 collend = p+1;
3488 while (collend < end) {
3489 if ((0 < *collend && *collend < 256) ||
3490 !Py_UNICODE_ISSPACE(*collend) ||
3491 Py_UNICODE_TODECIMAL(*collend))
3492 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003493 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003494 /* cache callback name lookup
3495 * (if not done yet, i.e. it's the first error) */
3496 if (known_errorHandler==-1) {
3497 if ((errors==NULL) || (!strcmp(errors, "strict")))
3498 known_errorHandler = 1;
3499 else if (!strcmp(errors, "replace"))
3500 known_errorHandler = 2;
3501 else if (!strcmp(errors, "ignore"))
3502 known_errorHandler = 3;
3503 else if (!strcmp(errors, "xmlcharrefreplace"))
3504 known_errorHandler = 4;
3505 else
3506 known_errorHandler = 0;
3507 }
3508 switch (known_errorHandler) {
3509 case 1: /* strict */
3510 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3511 goto onError;
3512 case 2: /* replace */
3513 for (p = collstart; p < collend; ++p)
3514 *output++ = '?';
3515 /* fall through */
3516 case 3: /* ignore */
3517 p = collend;
3518 break;
3519 case 4: /* xmlcharrefreplace */
3520 /* generate replacement (temporarily (mis)uses p) */
3521 for (p = collstart; p < collend; ++p)
3522 output += sprintf(output, "&#%d;", (int)*p);
3523 p = collend;
3524 break;
3525 default:
3526 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3527 encoding, reason, s, length, &exc,
3528 collstart-s, collend-s, &newpos);
3529 if (repunicode == NULL)
3530 goto onError;
3531 /* generate replacement */
3532 repsize = PyUnicode_GET_SIZE(repunicode);
3533 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3534 Py_UNICODE ch = *uni2;
3535 if (Py_UNICODE_ISSPACE(ch))
3536 *output++ = ' ';
3537 else {
3538 decimal = Py_UNICODE_TODECIMAL(ch);
3539 if (decimal >= 0)
3540 *output++ = '0' + decimal;
3541 else if (0 < ch && ch < 256)
3542 *output++ = (char)ch;
3543 else {
3544 Py_DECREF(repunicode);
3545 raise_encode_exception(&exc, encoding,
3546 s, length, collstart-s, collend-s, reason);
3547 goto onError;
3548 }
3549 }
3550 }
3551 p = s + newpos;
3552 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003553 }
3554 }
3555 /* 0-terminate the output string */
3556 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 Py_XDECREF(exc);
3558 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003559 return 0;
3560
3561 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 Py_XDECREF(exc);
3563 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003564 return -1;
3565}
3566
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567/* --- Helpers ------------------------------------------------------------ */
3568
3569static
3570int count(PyUnicodeObject *self,
3571 int start,
3572 int end,
3573 PyUnicodeObject *substring)
3574{
3575 int count = 0;
3576
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003577 if (start < 0)
3578 start += self->length;
3579 if (start < 0)
3580 start = 0;
3581 if (end > self->length)
3582 end = self->length;
3583 if (end < 0)
3584 end += self->length;
3585 if (end < 0)
3586 end = 0;
3587
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003588 if (substring->length == 0)
3589 return (end - start + 1);
3590
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591 end -= substring->length;
3592
3593 while (start <= end)
3594 if (Py_UNICODE_MATCH(self, start, substring)) {
3595 count++;
3596 start += substring->length;
3597 } else
3598 start++;
3599
3600 return count;
3601}
3602
3603int PyUnicode_Count(PyObject *str,
3604 PyObject *substr,
3605 int start,
3606 int end)
3607{
3608 int result;
3609
3610 str = PyUnicode_FromObject(str);
3611 if (str == NULL)
3612 return -1;
3613 substr = PyUnicode_FromObject(substr);
3614 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003615 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 return -1;
3617 }
3618
3619 result = count((PyUnicodeObject *)str,
3620 start, end,
3621 (PyUnicodeObject *)substr);
3622
3623 Py_DECREF(str);
3624 Py_DECREF(substr);
3625 return result;
3626}
3627
3628static
3629int findstring(PyUnicodeObject *self,
3630 PyUnicodeObject *substring,
3631 int start,
3632 int end,
3633 int direction)
3634{
3635 if (start < 0)
3636 start += self->length;
3637 if (start < 0)
3638 start = 0;
3639
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640 if (end > self->length)
3641 end = self->length;
3642 if (end < 0)
3643 end += self->length;
3644 if (end < 0)
3645 end = 0;
3646
Guido van Rossum76afbd92002-08-20 17:29:29 +00003647 if (substring->length == 0)
3648 return (direction > 0) ? start : end;
3649
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 end -= substring->length;
3651
3652 if (direction < 0) {
3653 for (; end >= start; end--)
3654 if (Py_UNICODE_MATCH(self, end, substring))
3655 return end;
3656 } else {
3657 for (; start <= end; start++)
3658 if (Py_UNICODE_MATCH(self, start, substring))
3659 return start;
3660 }
3661
3662 return -1;
3663}
3664
3665int PyUnicode_Find(PyObject *str,
3666 PyObject *substr,
3667 int start,
3668 int end,
3669 int direction)
3670{
3671 int result;
3672
3673 str = PyUnicode_FromObject(str);
3674 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003675 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003676 substr = PyUnicode_FromObject(substr);
3677 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003678 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003679 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 }
3681
3682 result = findstring((PyUnicodeObject *)str,
3683 (PyUnicodeObject *)substr,
3684 start, end, direction);
3685 Py_DECREF(str);
3686 Py_DECREF(substr);
3687 return result;
3688}
3689
3690static
3691int tailmatch(PyUnicodeObject *self,
3692 PyUnicodeObject *substring,
3693 int start,
3694 int end,
3695 int direction)
3696{
3697 if (start < 0)
3698 start += self->length;
3699 if (start < 0)
3700 start = 0;
3701
3702 if (substring->length == 0)
3703 return 1;
3704
3705 if (end > self->length)
3706 end = self->length;
3707 if (end < 0)
3708 end += self->length;
3709 if (end < 0)
3710 end = 0;
3711
3712 end -= substring->length;
3713 if (end < start)
3714 return 0;
3715
3716 if (direction > 0) {
3717 if (Py_UNICODE_MATCH(self, end, substring))
3718 return 1;
3719 } else {
3720 if (Py_UNICODE_MATCH(self, start, substring))
3721 return 1;
3722 }
3723
3724 return 0;
3725}
3726
3727int PyUnicode_Tailmatch(PyObject *str,
3728 PyObject *substr,
3729 int start,
3730 int end,
3731 int direction)
3732{
3733 int result;
3734
3735 str = PyUnicode_FromObject(str);
3736 if (str == NULL)
3737 return -1;
3738 substr = PyUnicode_FromObject(substr);
3739 if (substr == NULL) {
3740 Py_DECREF(substr);
3741 return -1;
3742 }
3743
3744 result = tailmatch((PyUnicodeObject *)str,
3745 (PyUnicodeObject *)substr,
3746 start, end, direction);
3747 Py_DECREF(str);
3748 Py_DECREF(substr);
3749 return result;
3750}
3751
3752static
3753const Py_UNICODE *findchar(const Py_UNICODE *s,
3754 int size,
3755 Py_UNICODE ch)
3756{
3757 /* like wcschr, but doesn't stop at NULL characters */
3758
3759 while (size-- > 0) {
3760 if (*s == ch)
3761 return s;
3762 s++;
3763 }
3764
3765 return NULL;
3766}
3767
3768/* Apply fixfct filter to the Unicode object self and return a
3769 reference to the modified object */
3770
3771static
3772PyObject *fixup(PyUnicodeObject *self,
3773 int (*fixfct)(PyUnicodeObject *s))
3774{
3775
3776 PyUnicodeObject *u;
3777
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003778 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 if (u == NULL)
3780 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003781
3782 Py_UNICODE_COPY(u->str, self->str, self->length);
3783
Tim Peters7a29bd52001-09-12 03:03:31 +00003784 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 /* fixfct should return TRUE if it modified the buffer. If
3786 FALSE, return a reference to the original buffer instead
3787 (to save space, not time) */
3788 Py_INCREF(self);
3789 Py_DECREF(u);
3790 return (PyObject*) self;
3791 }
3792 return (PyObject*) u;
3793}
3794
3795static
3796int fixupper(PyUnicodeObject *self)
3797{
3798 int len = self->length;
3799 Py_UNICODE *s = self->str;
3800 int status = 0;
3801
3802 while (len-- > 0) {
3803 register Py_UNICODE ch;
3804
3805 ch = Py_UNICODE_TOUPPER(*s);
3806 if (ch != *s) {
3807 status = 1;
3808 *s = ch;
3809 }
3810 s++;
3811 }
3812
3813 return status;
3814}
3815
3816static
3817int fixlower(PyUnicodeObject *self)
3818{
3819 int len = self->length;
3820 Py_UNICODE *s = self->str;
3821 int status = 0;
3822
3823 while (len-- > 0) {
3824 register Py_UNICODE ch;
3825
3826 ch = Py_UNICODE_TOLOWER(*s);
3827 if (ch != *s) {
3828 status = 1;
3829 *s = ch;
3830 }
3831 s++;
3832 }
3833
3834 return status;
3835}
3836
3837static
3838int fixswapcase(PyUnicodeObject *self)
3839{
3840 int len = self->length;
3841 Py_UNICODE *s = self->str;
3842 int status = 0;
3843
3844 while (len-- > 0) {
3845 if (Py_UNICODE_ISUPPER(*s)) {
3846 *s = Py_UNICODE_TOLOWER(*s);
3847 status = 1;
3848 } else if (Py_UNICODE_ISLOWER(*s)) {
3849 *s = Py_UNICODE_TOUPPER(*s);
3850 status = 1;
3851 }
3852 s++;
3853 }
3854
3855 return status;
3856}
3857
3858static
3859int fixcapitalize(PyUnicodeObject *self)
3860{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003861 int len = self->length;
3862 Py_UNICODE *s = self->str;
3863 int status = 0;
3864
3865 if (len == 0)
3866 return 0;
3867 if (Py_UNICODE_ISLOWER(*s)) {
3868 *s = Py_UNICODE_TOUPPER(*s);
3869 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003871 s++;
3872 while (--len > 0) {
3873 if (Py_UNICODE_ISUPPER(*s)) {
3874 *s = Py_UNICODE_TOLOWER(*s);
3875 status = 1;
3876 }
3877 s++;
3878 }
3879 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880}
3881
3882static
3883int fixtitle(PyUnicodeObject *self)
3884{
3885 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3886 register Py_UNICODE *e;
3887 int previous_is_cased;
3888
3889 /* Shortcut for single character strings */
3890 if (PyUnicode_GET_SIZE(self) == 1) {
3891 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3892 if (*p != ch) {
3893 *p = ch;
3894 return 1;
3895 }
3896 else
3897 return 0;
3898 }
3899
3900 e = p + PyUnicode_GET_SIZE(self);
3901 previous_is_cased = 0;
3902 for (; p < e; p++) {
3903 register const Py_UNICODE ch = *p;
3904
3905 if (previous_is_cased)
3906 *p = Py_UNICODE_TOLOWER(ch);
3907 else
3908 *p = Py_UNICODE_TOTITLE(ch);
3909
3910 if (Py_UNICODE_ISLOWER(ch) ||
3911 Py_UNICODE_ISUPPER(ch) ||
3912 Py_UNICODE_ISTITLE(ch))
3913 previous_is_cased = 1;
3914 else
3915 previous_is_cased = 0;
3916 }
3917 return 1;
3918}
3919
3920PyObject *PyUnicode_Join(PyObject *separator,
3921 PyObject *seq)
3922{
3923 Py_UNICODE *sep;
3924 int seplen;
3925 PyUnicodeObject *res = NULL;
3926 int reslen = 0;
3927 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 int sz = 100;
3929 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003930 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931
Tim Peters2cfe3682001-05-05 05:36:48 +00003932 it = PyObject_GetIter(seq);
3933 if (it == NULL)
3934 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003935
3936 if (separator == NULL) {
3937 Py_UNICODE blank = ' ';
3938 sep = &blank;
3939 seplen = 1;
3940 }
3941 else {
3942 separator = PyUnicode_FromObject(separator);
3943 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003944 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 sep = PyUnicode_AS_UNICODE(separator);
3946 seplen = PyUnicode_GET_SIZE(separator);
3947 }
3948
3949 res = _PyUnicode_New(sz);
3950 if (res == NULL)
3951 goto onError;
3952 p = PyUnicode_AS_UNICODE(res);
3953 reslen = 0;
3954
Tim Peters2cfe3682001-05-05 05:36:48 +00003955 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003957 PyObject *item = PyIter_Next(it);
3958 if (item == NULL) {
3959 if (PyErr_Occurred())
3960 goto onError;
3961 break;
3962 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 if (!PyUnicode_Check(item)) {
3964 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003965 if (!PyString_Check(item)) {
3966 PyErr_Format(PyExc_TypeError,
3967 "sequence item %i: expected string or Unicode,"
3968 " %.80s found",
3969 i, item->ob_type->tp_name);
3970 Py_DECREF(item);
3971 goto onError;
3972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973 v = PyUnicode_FromObject(item);
3974 Py_DECREF(item);
3975 item = v;
3976 if (item == NULL)
3977 goto onError;
3978 }
3979 itemlen = PyUnicode_GET_SIZE(item);
3980 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003981 if (_PyUnicode_Resize(&res, sz*2)) {
3982 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003984 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 sz *= 2;
3986 p = PyUnicode_AS_UNICODE(res) + reslen;
3987 }
3988 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003989 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990 p += seplen;
3991 reslen += seplen;
3992 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003993 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994 p += itemlen;
3995 reslen += itemlen;
3996 Py_DECREF(item);
3997 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003998 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 goto onError;
4000
4001 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004002 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 return (PyObject *)res;
4004
4005 onError:
4006 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004007 Py_XDECREF(res);
4008 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009 return NULL;
4010}
4011
4012static
4013PyUnicodeObject *pad(PyUnicodeObject *self,
4014 int left,
4015 int right,
4016 Py_UNICODE fill)
4017{
4018 PyUnicodeObject *u;
4019
4020 if (left < 0)
4021 left = 0;
4022 if (right < 0)
4023 right = 0;
4024
Tim Peters7a29bd52001-09-12 03:03:31 +00004025 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 Py_INCREF(self);
4027 return self;
4028 }
4029
4030 u = _PyUnicode_New(left + self->length + right);
4031 if (u) {
4032 if (left)
4033 Py_UNICODE_FILL(u->str, fill, left);
4034 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4035 if (right)
4036 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4037 }
4038
4039 return u;
4040}
4041
/* Append the slice data[left:right] to the list being built.

   The expansion relies on names from the calling function: a PyObject*
   variable `str` used as scratch, the target `list`, and a label
   `onError` that is jumped to when the slice cannot be created or
   appended.  The temporary reference held in `str` is always released.

   Wrapped in do { } while (0) so that the macro behaves like a single
   statement; the arguments are parenthesized against precedence
   surprises. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);                                             \
    } while (0)
4052
4053static
4054PyObject *split_whitespace(PyUnicodeObject *self,
4055 PyObject *list,
4056 int maxcount)
4057{
4058 register int i;
4059 register int j;
4060 int len = self->length;
4061 PyObject *str;
4062
4063 for (i = j = 0; i < len; ) {
4064 /* find a token */
4065 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4066 i++;
4067 j = i;
4068 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4069 i++;
4070 if (j < i) {
4071 if (maxcount-- <= 0)
4072 break;
4073 SPLIT_APPEND(self->str, j, i);
4074 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4075 i++;
4076 j = i;
4077 }
4078 }
4079 if (j < len) {
4080 SPLIT_APPEND(self->str, j, len);
4081 }
4082 return list;
4083
4084 onError:
4085 Py_DECREF(list);
4086 return NULL;
4087}
4088
4089PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004090 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091{
4092 register int i;
4093 register int j;
4094 int len;
4095 PyObject *list;
4096 PyObject *str;
4097 Py_UNICODE *data;
4098
4099 string = PyUnicode_FromObject(string);
4100 if (string == NULL)
4101 return NULL;
4102 data = PyUnicode_AS_UNICODE(string);
4103 len = PyUnicode_GET_SIZE(string);
4104
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105 list = PyList_New(0);
4106 if (!list)
4107 goto onError;
4108
4109 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004110 int eol;
4111
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112 /* Find a line and append it */
4113 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4114 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115
4116 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004117 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 if (i < len) {
4119 if (data[i] == '\r' && i + 1 < len &&
4120 data[i+1] == '\n')
4121 i += 2;
4122 else
4123 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004124 if (keepends)
4125 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 }
Guido van Rossum86662912000-04-11 15:38:46 +00004127 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 j = i;
4129 }
4130 if (j < len) {
4131 SPLIT_APPEND(data, j, len);
4132 }
4133
4134 Py_DECREF(string);
4135 return list;
4136
4137 onError:
4138 Py_DECREF(list);
4139 Py_DECREF(string);
4140 return NULL;
4141}
4142
4143static
4144PyObject *split_char(PyUnicodeObject *self,
4145 PyObject *list,
4146 Py_UNICODE ch,
4147 int maxcount)
4148{
4149 register int i;
4150 register int j;
4151 int len = self->length;
4152 PyObject *str;
4153
4154 for (i = j = 0; i < len; ) {
4155 if (self->str[i] == ch) {
4156 if (maxcount-- <= 0)
4157 break;
4158 SPLIT_APPEND(self->str, j, i);
4159 i = j = i + 1;
4160 } else
4161 i++;
4162 }
4163 if (j <= len) {
4164 SPLIT_APPEND(self->str, j, len);
4165 }
4166 return list;
4167
4168 onError:
4169 Py_DECREF(list);
4170 return NULL;
4171}
4172
4173static
4174PyObject *split_substring(PyUnicodeObject *self,
4175 PyObject *list,
4176 PyUnicodeObject *substring,
4177 int maxcount)
4178{
4179 register int i;
4180 register int j;
4181 int len = self->length;
4182 int sublen = substring->length;
4183 PyObject *str;
4184
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004185 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186 if (Py_UNICODE_MATCH(self, i, substring)) {
4187 if (maxcount-- <= 0)
4188 break;
4189 SPLIT_APPEND(self->str, j, i);
4190 i = j = i + sublen;
4191 } else
4192 i++;
4193 }
4194 if (j <= len) {
4195 SPLIT_APPEND(self->str, j, len);
4196 }
4197 return list;
4198
4199 onError:
4200 Py_DECREF(list);
4201 return NULL;
4202}
4203
4204#undef SPLIT_APPEND
4205
4206static
4207PyObject *split(PyUnicodeObject *self,
4208 PyUnicodeObject *substring,
4209 int maxcount)
4210{
4211 PyObject *list;
4212
4213 if (maxcount < 0)
4214 maxcount = INT_MAX;
4215
4216 list = PyList_New(0);
4217 if (!list)
4218 return NULL;
4219
4220 if (substring == NULL)
4221 return split_whitespace(self,list,maxcount);
4222
4223 else if (substring->length == 1)
4224 return split_char(self,list,substring->str[0],maxcount);
4225
4226 else if (substring->length == 0) {
4227 Py_DECREF(list);
4228 PyErr_SetString(PyExc_ValueError, "empty separator");
4229 return NULL;
4230 }
4231 else
4232 return split_substring(self,list,substring,maxcount);
4233}
4234
4235static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236PyObject *replace(PyUnicodeObject *self,
4237 PyUnicodeObject *str1,
4238 PyUnicodeObject *str2,
4239 int maxcount)
4240{
4241 PyUnicodeObject *u;
4242
4243 if (maxcount < 0)
4244 maxcount = INT_MAX;
4245
4246 if (str1->length == 1 && str2->length == 1) {
4247 int i;
4248
4249 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004250 if (!findchar(self->str, self->length, str1->str[0]) &&
4251 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004252 /* nothing to replace, return original string */
4253 Py_INCREF(self);
4254 u = self;
4255 } else {
4256 Py_UNICODE u1 = str1->str[0];
4257 Py_UNICODE u2 = str2->str[0];
4258
4259 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004260 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261 self->length
4262 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004263 if (u != NULL) {
4264 Py_UNICODE_COPY(u->str, self->str,
4265 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 for (i = 0; i < u->length; i++)
4267 if (u->str[i] == u1) {
4268 if (--maxcount < 0)
4269 break;
4270 u->str[i] = u2;
4271 }
4272 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004273 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274
4275 } else {
4276 int n, i;
4277 Py_UNICODE *p;
4278
4279 /* replace strings */
4280 n = count(self, 0, self->length, str1);
4281 if (n > maxcount)
4282 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004283 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004285 if (PyUnicode_CheckExact(self)) {
4286 Py_INCREF(self);
4287 u = self;
4288 }
4289 else {
4290 u = (PyUnicodeObject *)
4291 PyUnicode_FromUnicode(self->str, self->length);
4292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293 } else {
4294 u = _PyUnicode_New(
4295 self->length + n * (str2->length - str1->length));
4296 if (u) {
4297 i = 0;
4298 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004299 if (str1->length > 0) {
4300 while (i <= self->length - str1->length)
4301 if (Py_UNICODE_MATCH(self, i, str1)) {
4302 /* replace string segment */
4303 Py_UNICODE_COPY(p, str2->str, str2->length);
4304 p += str2->length;
4305 i += str1->length;
4306 if (--n <= 0) {
4307 /* copy remaining part */
4308 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4309 break;
4310 }
4311 } else
4312 *p++ = self->str[i++];
4313 } else {
4314 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 Py_UNICODE_COPY(p, str2->str, str2->length);
4316 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004317 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004320 }
4321 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323 }
4324 }
4325 }
4326
4327 return (PyObject *) u;
4328}
4329
4330/* --- Unicode Object Methods --------------------------------------------- */
4331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004332PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333"S.title() -> unicode\n\
4334\n\
4335Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004336characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004337
4338static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004339unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 return fixup(self, fixtitle);
4342}
4343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004344PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345"S.capitalize() -> unicode\n\
4346\n\
4347Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004348have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349
4350static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004351unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353 return fixup(self, fixcapitalize);
4354}
4355
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method implementation: split self on whitespace,
   capitalize every word via fixup()/fixcapitalize and join the words
   again with the default separator of PyUnicode_Join().  Returns NULL
   on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* swap the list slot over to the capitalized word */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
4393
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004394PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395"S.center(width) -> unicode\n\
4396\n\
4397Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004398using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399
4400static PyObject *
4401unicode_center(PyUnicodeObject *self, PyObject *args)
4402{
4403 int marg, left;
4404 int width;
4405
4406 if (!PyArg_ParseTuple(args, "i:center", &width))
4407 return NULL;
4408
Tim Peters7a29bd52001-09-12 03:03:31 +00004409 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410 Py_INCREF(self);
4411 return (PyObject*) self;
4412 }
4413
4414 marg = width - self->length;
4415 left = marg / 2 + (marg & width & 1);
4416
4417 return (PyObject*) pad(self, left, marg - left, ' ');
4418}
4419
Marc-André Lemburge5034372000-08-08 08:04:29 +00004420#if 0
4421
4422/* This code should go into some future Unicode collation support
4423 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004424 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004425
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004426/* speedy UTF-16 code point order comparison */
4427/* gleaned from: */
4428/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4429
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004430static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004431{
4432 0, 0, 0, 0, 0, 0, 0, 0,
4433 0, 0, 0, 0, 0, 0, 0, 0,
4434 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004435 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004436};
4437
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438static int
4439unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4440{
4441 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004442
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 Py_UNICODE *s1 = str1->str;
4444 Py_UNICODE *s2 = str2->str;
4445
4446 len1 = str1->length;
4447 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004448
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004450 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004451
4452 c1 = *s1++;
4453 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004454
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004455 if (c1 > (1<<11) * 26)
4456 c1 += utf16Fixup[c1>>11];
4457 if (c2 > (1<<11) * 26)
4458 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004459 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004460
4461 if (c1 != c2)
4462 return (c1 < c2) ? -1 : 1;
4463
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004464 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 }
4466
4467 return (len1 < len2) ? -1 : (len1 != len2);
4468}
4469
Marc-André Lemburge5034372000-08-08 08:04:29 +00004470#else
4471
4472static int
4473unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4474{
4475 register int len1, len2;
4476
4477 Py_UNICODE *s1 = str1->str;
4478 Py_UNICODE *s2 = str2->str;
4479
4480 len1 = str1->length;
4481 len2 = str2->length;
4482
4483 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00004484 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004485
Fredrik Lundh45714e92001-06-26 16:39:36 +00004486 c1 = *s1++;
4487 c2 = *s2++;
4488
4489 if (c1 != c2)
4490 return (c1 < c2) ? -1 : 1;
4491
Marc-André Lemburge5034372000-08-08 08:04:29 +00004492 len1--; len2--;
4493 }
4494
4495 return (len1 < len2) ? -1 : (len1 != len2);
4496}
4497
4498#endif
4499
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500int PyUnicode_Compare(PyObject *left,
4501 PyObject *right)
4502{
4503 PyUnicodeObject *u = NULL, *v = NULL;
4504 int result;
4505
4506 /* Coerce the two arguments */
4507 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4508 if (u == NULL)
4509 goto onError;
4510 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4511 if (v == NULL)
4512 goto onError;
4513
Thomas Wouters7e474022000-07-16 12:04:32 +00004514 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515 if (v == u) {
4516 Py_DECREF(u);
4517 Py_DECREF(v);
4518 return 0;
4519 }
4520
4521 result = unicode_compare(u, v);
4522
4523 Py_DECREF(u);
4524 Py_DECREF(v);
4525 return result;
4526
4527onError:
4528 Py_XDECREF(u);
4529 Py_XDECREF(v);
4530 return -1;
4531}
4532
Guido van Rossum403d68b2000-03-13 15:55:09 +00004533int PyUnicode_Contains(PyObject *container,
4534 PyObject *element)
4535{
4536 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004537 int result, size;
4538 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004539
4540 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004541 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004542 if (v == NULL) {
4543 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004544 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004545 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004546 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004547 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004548 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004549 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004550
Barry Warsaw817918c2002-08-06 16:58:21 +00004551 size = PyUnicode_GET_SIZE(v);
4552 rhs = PyUnicode_AS_UNICODE(v);
4553 lhs = PyUnicode_AS_UNICODE(u);
4554
Guido van Rossum403d68b2000-03-13 15:55:09 +00004555 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004556 if (size == 1) {
4557 end = lhs + PyUnicode_GET_SIZE(u);
4558 while (lhs < end) {
4559 if (*lhs++ == *rhs) {
4560 result = 1;
4561 break;
4562 }
4563 }
4564 }
4565 else {
4566 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4567 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004568 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004569 result = 1;
4570 break;
4571 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004572 }
4573 }
4574
4575 Py_DECREF(u);
4576 Py_DECREF(v);
4577 return result;
4578
4579onError:
4580 Py_XDECREF(u);
4581 Py_XDECREF(v);
4582 return -1;
4583}
4584
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585/* Concat to string or Unicode object giving a new Unicode object. */
4586
4587PyObject *PyUnicode_Concat(PyObject *left,
4588 PyObject *right)
4589{
4590 PyUnicodeObject *u = NULL, *v = NULL, *w;
4591
4592 /* Coerce the two arguments */
4593 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4594 if (u == NULL)
4595 goto onError;
4596 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4597 if (v == NULL)
4598 goto onError;
4599
4600 /* Shortcuts */
4601 if (v == unicode_empty) {
4602 Py_DECREF(v);
4603 return (PyObject *)u;
4604 }
4605 if (u == unicode_empty) {
4606 Py_DECREF(u);
4607 return (PyObject *)v;
4608 }
4609
4610 /* Concat the two Unicode strings */
4611 w = _PyUnicode_New(u->length + v->length);
4612 if (w == NULL)
4613 goto onError;
4614 Py_UNICODE_COPY(w->str, u->str, u->length);
4615 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4616
4617 Py_DECREF(u);
4618 Py_DECREF(v);
4619 return (PyObject *)w;
4620
4621onError:
4622 Py_XDECREF(u);
4623 Py_XDECREF(v);
4624 return NULL;
4625}
4626
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004627PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628"S.count(sub[, start[, end]]) -> int\n\
4629\n\
4630Return the number of occurrences of substring sub in Unicode string\n\
4631S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004632interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633
4634static PyObject *
4635unicode_count(PyUnicodeObject *self, PyObject *args)
4636{
4637 PyUnicodeObject *substring;
4638 int start = 0;
4639 int end = INT_MAX;
4640 PyObject *result;
4641
Guido van Rossumb8872e62000-05-09 14:14:27 +00004642 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4643 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004644 return NULL;
4645
4646 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4647 (PyObject *)substring);
4648 if (substring == NULL)
4649 return NULL;
4650
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651 if (start < 0)
4652 start += self->length;
4653 if (start < 0)
4654 start = 0;
4655 if (end > self->length)
4656 end = self->length;
4657 if (end < 0)
4658 end += self->length;
4659 if (end < 0)
4660 end = 0;
4661
4662 result = PyInt_FromLong((long) count(self, start, end, substring));
4663
4664 Py_DECREF(substring);
4665 return result;
4666}
4667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004668PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669"S.encode([encoding[,errors]]) -> string\n\
4670\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004671Return an encoded string version of S. Default encoding is the current\n\
4672default string encoding. errors may be given to set a different error\n\
4673handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004674a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4675'xmlcharrefreplace' as well as any other name registered with\n\
4676codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677
4678static PyObject *
4679unicode_encode(PyUnicodeObject *self, PyObject *args)
4680{
4681 char *encoding = NULL;
4682 char *errors = NULL;
4683 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4684 return NULL;
4685 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4686}
4687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004688PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689"S.expandtabs([tabsize]) -> unicode\n\
4690\n\
4691Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004692If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693
4694static PyObject*
4695unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4696{
4697 Py_UNICODE *e;
4698 Py_UNICODE *p;
4699 Py_UNICODE *q;
4700 int i, j;
4701 PyUnicodeObject *u;
4702 int tabsize = 8;
4703
4704 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4705 return NULL;
4706
Thomas Wouters7e474022000-07-16 12:04:32 +00004707 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708 i = j = 0;
4709 e = self->str + self->length;
4710 for (p = self->str; p < e; p++)
4711 if (*p == '\t') {
4712 if (tabsize > 0)
4713 j += tabsize - (j % tabsize);
4714 }
4715 else {
4716 j++;
4717 if (*p == '\n' || *p == '\r') {
4718 i += j;
4719 j = 0;
4720 }
4721 }
4722
4723 /* Second pass: create output string and fill it */
4724 u = _PyUnicode_New(i + j);
4725 if (!u)
4726 return NULL;
4727
4728 j = 0;
4729 q = u->str;
4730
4731 for (p = self->str; p < e; p++)
4732 if (*p == '\t') {
4733 if (tabsize > 0) {
4734 i = tabsize - (j % tabsize);
4735 j += i;
4736 while (i--)
4737 *q++ = ' ';
4738 }
4739 }
4740 else {
4741 j++;
4742 *q++ = *p;
4743 if (*p == '\n' || *p == '\r')
4744 j = 0;
4745 }
4746
4747 return (PyObject*) u;
4748}
4749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004750PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751"S.find(sub [,start [,end]]) -> int\n\
4752\n\
4753Return the lowest index in S where substring sub is found,\n\
4754such that sub is contained within s[start,end]. Optional\n\
4755arguments start and end are interpreted as in slice notation.\n\
4756\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004757Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758
4759static PyObject *
4760unicode_find(PyUnicodeObject *self, PyObject *args)
4761{
4762 PyUnicodeObject *substring;
4763 int start = 0;
4764 int end = INT_MAX;
4765 PyObject *result;
4766
Guido van Rossumb8872e62000-05-09 14:14:27 +00004767 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4768 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769 return NULL;
4770 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4771 (PyObject *)substring);
4772 if (substring == NULL)
4773 return NULL;
4774
4775 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4776
4777 Py_DECREF(substring);
4778 return result;
4779}
4780
4781static PyObject *
4782unicode_getitem(PyUnicodeObject *self, int index)
4783{
4784 if (index < 0 || index >= self->length) {
4785 PyErr_SetString(PyExc_IndexError, "string index out of range");
4786 return NULL;
4787 }
4788
4789 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4790}
4791
4792static long
4793unicode_hash(PyUnicodeObject *self)
4794{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004795 /* Since Unicode objects compare equal to their ASCII string
4796 counterparts, they should use the individual character values
4797 as basis for their hash value. This is needed to assure that
4798 strings and Unicode objects behave in the same way as
4799 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800
Fredrik Lundhdde61642000-07-10 18:27:47 +00004801 register int len;
4802 register Py_UNICODE *p;
4803 register long x;
4804
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 if (self->hash != -1)
4806 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004807 len = PyUnicode_GET_SIZE(self);
4808 p = PyUnicode_AS_UNICODE(self);
4809 x = *p << 7;
4810 while (--len >= 0)
4811 x = (1000003*x) ^ *p++;
4812 x ^= PyUnicode_GET_SIZE(self);
4813 if (x == -1)
4814 x = -2;
4815 self->hash = x;
4816 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817}
4818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004819PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820"S.index(sub [,start [,end]]) -> int\n\
4821\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004822Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823
4824static PyObject *
4825unicode_index(PyUnicodeObject *self, PyObject *args)
4826{
4827 int result;
4828 PyUnicodeObject *substring;
4829 int start = 0;
4830 int end = INT_MAX;
4831
Guido van Rossumb8872e62000-05-09 14:14:27 +00004832 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4833 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834 return NULL;
4835
4836 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4837 (PyObject *)substring);
4838 if (substring == NULL)
4839 return NULL;
4840
4841 result = findstring(self, substring, start, end, 1);
4842
4843 Py_DECREF(substring);
4844 if (result < 0) {
4845 PyErr_SetString(PyExc_ValueError, "substring not found");
4846 return NULL;
4847 }
4848 return PyInt_FromLong(result);
4849}
4850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004851PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004852"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004854Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004855at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856
4857static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004858unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859{
4860 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4861 register const Py_UNICODE *e;
4862 int cased;
4863
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 /* Shortcut for single character strings */
4865 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004866 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004868 /* Special case for empty strings */
4869 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004870 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004871
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872 e = p + PyUnicode_GET_SIZE(self);
4873 cased = 0;
4874 for (; p < e; p++) {
4875 register const Py_UNICODE ch = *p;
4876
4877 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004878 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 else if (!cased && Py_UNICODE_ISLOWER(ch))
4880 cased = 1;
4881 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004882 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883}
4884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004885PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004886"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004888Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004889at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890
4891static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004892unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893{
4894 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4895 register const Py_UNICODE *e;
4896 int cased;
4897
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898 /* Shortcut for single character strings */
4899 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004900 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004902 /* Special case for empty strings */
4903 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004904 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004905
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906 e = p + PyUnicode_GET_SIZE(self);
4907 cased = 0;
4908 for (; p < e; p++) {
4909 register const Py_UNICODE ch = *p;
4910
4911 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004912 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913 else if (!cased && Py_UNICODE_ISUPPER(ch))
4914 cased = 1;
4915 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004916 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917}
4918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004919PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004920"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004922Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4923characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004924only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925
4926static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004927unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928{
4929 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4930 register const Py_UNICODE *e;
4931 int cased, previous_is_cased;
4932
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933 /* Shortcut for single character strings */
4934 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004935 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4936 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004938 /* Special case for empty strings */
4939 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004940 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004941
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942 e = p + PyUnicode_GET_SIZE(self);
4943 cased = 0;
4944 previous_is_cased = 0;
4945 for (; p < e; p++) {
4946 register const Py_UNICODE ch = *p;
4947
4948 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4949 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004950 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951 previous_is_cased = 1;
4952 cased = 1;
4953 }
4954 else if (Py_UNICODE_ISLOWER(ch)) {
4955 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004956 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957 previous_is_cased = 1;
4958 cased = 1;
4959 }
4960 else
4961 previous_is_cased = 0;
4962 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004963 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964}
4965
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004966PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004967"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004969Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004970False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971
4972static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004973unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974{
4975 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4976 register const Py_UNICODE *e;
4977
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978 /* Shortcut for single character strings */
4979 if (PyUnicode_GET_SIZE(self) == 1 &&
4980 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004981 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004983 /* Special case for empty strings */
4984 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004985 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004986
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 e = p + PyUnicode_GET_SIZE(self);
4988 for (; p < e; p++) {
4989 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004990 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004992 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993}
4994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004995PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004996"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004997\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004998Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004999and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005000
5001static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005002unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005003{
5004 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5005 register const Py_UNICODE *e;
5006
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005007 /* Shortcut for single character strings */
5008 if (PyUnicode_GET_SIZE(self) == 1 &&
5009 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005010 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005011
5012 /* Special case for empty strings */
5013 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005014 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005015
5016 e = p + PyUnicode_GET_SIZE(self);
5017 for (; p < e; p++) {
5018 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005019 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005020 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005021 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005022}
5023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005024PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005025"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005026\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005027Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005028and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005029
5030static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005031unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005032{
5033 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5034 register const Py_UNICODE *e;
5035
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005036 /* Shortcut for single character strings */
5037 if (PyUnicode_GET_SIZE(self) == 1 &&
5038 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005039 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005040
5041 /* Special case for empty strings */
5042 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005043 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005044
5045 e = p + PyUnicode_GET_SIZE(self);
5046 for (; p < e; p++) {
5047 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005048 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005049 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005050 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005051}
5052
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005053PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005054"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005056Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005057False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058
5059static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005060unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061{
5062 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5063 register const Py_UNICODE *e;
5064
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065 /* Shortcut for single character strings */
5066 if (PyUnicode_GET_SIZE(self) == 1 &&
5067 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005068 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005070 /* Special case for empty strings */
5071 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005072 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005073
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074 e = p + PyUnicode_GET_SIZE(self);
5075 for (; p < e; p++) {
5076 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005077 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005079 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080}
5081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005082PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005083"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005085Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005086False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087
5088static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005089unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090{
5091 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5092 register const Py_UNICODE *e;
5093
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094 /* Shortcut for single character strings */
5095 if (PyUnicode_GET_SIZE(self) == 1 &&
5096 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005097 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005099 /* Special case for empty strings */
5100 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005101 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005102
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 e = p + PyUnicode_GET_SIZE(self);
5104 for (; p < e; p++) {
5105 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005106 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005108 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109}
5110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005111PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005112"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005114Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005115False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116
5117static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005118unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119{
5120 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5121 register const Py_UNICODE *e;
5122
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 /* Shortcut for single character strings */
5124 if (PyUnicode_GET_SIZE(self) == 1 &&
5125 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005126 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005128 /* Special case for empty strings */
5129 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005130 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005131
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 e = p + PyUnicode_GET_SIZE(self);
5133 for (; p < e; p++) {
5134 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005135 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005137 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138}
5139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005140PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141"S.join(sequence) -> unicode\n\
5142\n\
5143Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005144sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145
5146static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005147unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005149 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150}
5151
5152static int
5153unicode_length(PyUnicodeObject *self)
5154{
5155 return self->length;
5156}
5157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005158PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159"S.ljust(width) -> unicode\n\
5160\n\
5161Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005162done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163
5164static PyObject *
5165unicode_ljust(PyUnicodeObject *self, PyObject *args)
5166{
5167 int width;
5168 if (!PyArg_ParseTuple(args, "i:ljust", &width))
5169 return NULL;
5170
Tim Peters7a29bd52001-09-12 03:03:31 +00005171 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172 Py_INCREF(self);
5173 return (PyObject*) self;
5174 }
5175
5176 return (PyObject*) pad(self, 0, width - self->length, ' ');
5177}
5178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005179PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180"S.lower() -> unicode\n\
5181\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005182Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183
5184static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005185unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 return fixup(self, fixlower);
5188}
5189
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Name part of a stripformat entry: skips its first three characters
   (the "|O:" prefix). */
#define STRIPNAME(i) (stripformat[i]+3)
5198
5199static const Py_UNICODE *
5200unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5201{
Tim Peters030a5ce2002-04-22 19:00:10 +00005202 size_t i;
5203 for (i = 0; i < n; ++i)
5204 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005205 return s+i;
5206 return NULL;
5207}
5208
5209/* externally visible for str.strip(unicode) */
5210PyObject *
5211_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5212{
5213 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5214 int len = PyUnicode_GET_SIZE(self);
5215 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5216 int seplen = PyUnicode_GET_SIZE(sepobj);
5217 int i, j;
5218
5219 i = 0;
5220 if (striptype != RIGHTSTRIP) {
5221 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5222 i++;
5223 }
5224 }
5225
5226 j = len;
5227 if (striptype != LEFTSTRIP) {
5228 do {
5229 j--;
5230 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5231 j++;
5232 }
5233
5234 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5235 Py_INCREF(self);
5236 return (PyObject*)self;
5237 }
5238 else
5239 return PyUnicode_FromUnicode(s+i, j-i);
5240}
5241
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242
5243static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005244do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005246 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5247 int len = PyUnicode_GET_SIZE(self), i, j;
5248
5249 i = 0;
5250 if (striptype != RIGHTSTRIP) {
5251 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5252 i++;
5253 }
5254 }
5255
5256 j = len;
5257 if (striptype != LEFTSTRIP) {
5258 do {
5259 j--;
5260 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5261 j++;
5262 }
5263
5264 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5265 Py_INCREF(self);
5266 return (PyObject*)self;
5267 }
5268 else
5269 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270}
5271
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005272
5273static PyObject *
5274do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5275{
5276 PyObject *sep = NULL;
5277
5278 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5279 return NULL;
5280
5281 if (sep != NULL && sep != Py_None) {
5282 if (PyUnicode_Check(sep))
5283 return _PyUnicode_XStrip(self, striptype, sep);
5284 else if (PyString_Check(sep)) {
5285 PyObject *res;
5286 sep = PyUnicode_FromObject(sep);
5287 if (sep==NULL)
5288 return NULL;
5289 res = _PyUnicode_XStrip(self, striptype, sep);
5290 Py_DECREF(sep);
5291 return res;
5292 }
5293 else {
5294 PyErr_Format(PyExc_TypeError,
5295 "%s arg must be None, unicode or str",
5296 STRIPNAME(striptype));
5297 return NULL;
5298 }
5299 }
5300
5301 return do_strip(self, striptype);
5302}
5303
5304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005305PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005306"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005307\n\
5308Return a copy of the string S with leading and trailing\n\
5309whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005310If chars is given and not None, remove characters in chars instead.\n\
5311If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005312
5313static PyObject *
5314unicode_strip(PyUnicodeObject *self, PyObject *args)
5315{
5316 if (PyTuple_GET_SIZE(args) == 0)
5317 return do_strip(self, BOTHSTRIP); /* Common case */
5318 else
5319 return do_argstrip(self, BOTHSTRIP, args);
5320}
5321
5322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005323PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005324"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005325\n\
5326Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005327If chars is given and not None, remove characters in chars instead.\n\
5328If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005329
5330static PyObject *
5331unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5332{
5333 if (PyTuple_GET_SIZE(args) == 0)
5334 return do_strip(self, LEFTSTRIP); /* Common case */
5335 else
5336 return do_argstrip(self, LEFTSTRIP, args);
5337}
5338
5339
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005340PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005341"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005342\n\
5343Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005344If chars is given and not None, remove characters in chars instead.\n\
5345If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005346
5347static PyObject *
5348unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5349{
5350 if (PyTuple_GET_SIZE(args) == 0)
5351 return do_strip(self, RIGHTSTRIP); /* Common case */
5352 else
5353 return do_argstrip(self, RIGHTSTRIP, args);
5354}
5355
5356
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357static PyObject*
5358unicode_repeat(PyUnicodeObject *str, int len)
5359{
5360 PyUnicodeObject *u;
5361 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005362 int nchars;
5363 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364
5365 if (len < 0)
5366 len = 0;
5367
Tim Peters7a29bd52001-09-12 03:03:31 +00005368 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 /* no repeat, return original string */
5370 Py_INCREF(str);
5371 return (PyObject*) str;
5372 }
Tim Peters8f422462000-09-09 06:13:41 +00005373
5374 /* ensure # of chars needed doesn't overflow int and # of bytes
5375 * needed doesn't overflow size_t
5376 */
5377 nchars = len * str->length;
5378 if (len && nchars / len != str->length) {
5379 PyErr_SetString(PyExc_OverflowError,
5380 "repeated string is too long");
5381 return NULL;
5382 }
5383 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5384 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5385 PyErr_SetString(PyExc_OverflowError,
5386 "repeated string is too long");
5387 return NULL;
5388 }
5389 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 if (!u)
5391 return NULL;
5392
5393 p = u->str;
5394
5395 while (len-- > 0) {
5396 Py_UNICODE_COPY(p, str->str, str->length);
5397 p += str->length;
5398 }
5399
5400 return (PyObject*) u;
5401}
5402
5403PyObject *PyUnicode_Replace(PyObject *obj,
5404 PyObject *subobj,
5405 PyObject *replobj,
5406 int maxcount)
5407{
5408 PyObject *self;
5409 PyObject *str1;
5410 PyObject *str2;
5411 PyObject *result;
5412
5413 self = PyUnicode_FromObject(obj);
5414 if (self == NULL)
5415 return NULL;
5416 str1 = PyUnicode_FromObject(subobj);
5417 if (str1 == NULL) {
5418 Py_DECREF(self);
5419 return NULL;
5420 }
5421 str2 = PyUnicode_FromObject(replobj);
5422 if (str2 == NULL) {
5423 Py_DECREF(self);
5424 Py_DECREF(str1);
5425 return NULL;
5426 }
5427 result = replace((PyUnicodeObject *)self,
5428 (PyUnicodeObject *)str1,
5429 (PyUnicodeObject *)str2,
5430 maxcount);
5431 Py_DECREF(self);
5432 Py_DECREF(str1);
5433 Py_DECREF(str2);
5434 return result;
5435}
5436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005437PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438"S.replace (old, new[, maxsplit]) -> unicode\n\
5439\n\
5440Return a copy of S with all occurrences of substring\n\
5441old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005442given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443
5444static PyObject*
5445unicode_replace(PyUnicodeObject *self, PyObject *args)
5446{
5447 PyUnicodeObject *str1;
5448 PyUnicodeObject *str2;
5449 int maxcount = -1;
5450 PyObject *result;
5451
5452 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5453 return NULL;
5454 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5455 if (str1 == NULL)
5456 return NULL;
5457 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005458 if (str2 == NULL) {
5459 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005461 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462
5463 result = replace(self, str1, str2, maxcount);
5464
5465 Py_DECREF(str1);
5466 Py_DECREF(str2);
5467 return result;
5468}
5469
5470static
5471PyObject *unicode_repr(PyObject *unicode)
5472{
5473 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5474 PyUnicode_GET_SIZE(unicode),
5475 1);
5476}
5477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005478PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479"S.rfind(sub [,start [,end]]) -> int\n\
5480\n\
5481Return the highest index in S where substring sub is found,\n\
5482such that sub is contained within s[start,end]. Optional\n\
5483arguments start and end are interpreted as in slice notation.\n\
5484\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005485Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486
5487static PyObject *
5488unicode_rfind(PyUnicodeObject *self, PyObject *args)
5489{
5490 PyUnicodeObject *substring;
5491 int start = 0;
5492 int end = INT_MAX;
5493 PyObject *result;
5494
Guido van Rossumb8872e62000-05-09 14:14:27 +00005495 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5496 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 return NULL;
5498 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5499 (PyObject *)substring);
5500 if (substring == NULL)
5501 return NULL;
5502
5503 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5504
5505 Py_DECREF(substring);
5506 return result;
5507}
5508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005509PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510"S.rindex(sub [,start [,end]]) -> int\n\
5511\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005512Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513
5514static PyObject *
5515unicode_rindex(PyUnicodeObject *self, PyObject *args)
5516{
5517 int result;
5518 PyUnicodeObject *substring;
5519 int start = 0;
5520 int end = INT_MAX;
5521
Guido van Rossumb8872e62000-05-09 14:14:27 +00005522 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5523 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524 return NULL;
5525 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5526 (PyObject *)substring);
5527 if (substring == NULL)
5528 return NULL;
5529
5530 result = findstring(self, substring, start, end, -1);
5531
5532 Py_DECREF(substring);
5533 if (result < 0) {
5534 PyErr_SetString(PyExc_ValueError, "substring not found");
5535 return NULL;
5536 }
5537 return PyInt_FromLong(result);
5538}
5539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005540PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541"S.rjust(width) -> unicode\n\
5542\n\
5543Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005544done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545
5546static PyObject *
5547unicode_rjust(PyUnicodeObject *self, PyObject *args)
5548{
5549 int width;
5550 if (!PyArg_ParseTuple(args, "i:rjust", &width))
5551 return NULL;
5552
Tim Peters7a29bd52001-09-12 03:03:31 +00005553 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 Py_INCREF(self);
5555 return (PyObject*) self;
5556 }
5557
5558 return (PyObject*) pad(self, width - self->length, 0, ' ');
5559}
5560
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561static PyObject*
5562unicode_slice(PyUnicodeObject *self, int start, int end)
5563{
5564 /* standard clamping */
5565 if (start < 0)
5566 start = 0;
5567 if (end < 0)
5568 end = 0;
5569 if (end > self->length)
5570 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005571 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 /* full slice, return original string */
5573 Py_INCREF(self);
5574 return (PyObject*) self;
5575 }
5576 if (start > end)
5577 start = end;
5578 /* copy slice */
5579 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5580 end - start);
5581}
5582
5583PyObject *PyUnicode_Split(PyObject *s,
5584 PyObject *sep,
5585 int maxsplit)
5586{
5587 PyObject *result;
5588
5589 s = PyUnicode_FromObject(s);
5590 if (s == NULL)
5591 return NULL;
5592 if (sep != NULL) {
5593 sep = PyUnicode_FromObject(sep);
5594 if (sep == NULL) {
5595 Py_DECREF(s);
5596 return NULL;
5597 }
5598 }
5599
5600 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5601
5602 Py_DECREF(s);
5603 Py_XDECREF(sep);
5604 return result;
5605}
5606
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005607PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608"S.split([sep [,maxsplit]]) -> list of strings\n\
5609\n\
5610Return a list of the words in S, using sep as the\n\
5611delimiter string. If maxsplit is given, at most maxsplit\n\
5612splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005613is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614
5615static PyObject*
5616unicode_split(PyUnicodeObject *self, PyObject *args)
5617{
5618 PyObject *substring = Py_None;
5619 int maxcount = -1;
5620
5621 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5622 return NULL;
5623
5624 if (substring == Py_None)
5625 return split(self, NULL, maxcount);
5626 else if (PyUnicode_Check(substring))
5627 return split(self, (PyUnicodeObject *)substring, maxcount);
5628 else
5629 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5630}
5631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005632PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005633"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634\n\
5635Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005636Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005637is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638
5639static PyObject*
5640unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5641{
Guido van Rossum86662912000-04-11 15:38:46 +00005642 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643
Guido van Rossum86662912000-04-11 15:38:46 +00005644 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 return NULL;
5646
Guido van Rossum86662912000-04-11 15:38:46 +00005647 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648}
5649
5650static
5651PyObject *unicode_str(PyUnicodeObject *self)
5652{
Fred Drakee4315f52000-05-09 19:53:39 +00005653 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654}
5655
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005656PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657"S.swapcase() -> unicode\n\
5658\n\
5659Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005660and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661
5662static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005663unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 return fixup(self, fixswapcase);
5666}
5667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005668PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669"S.translate(table) -> unicode\n\
5670\n\
5671Return a copy of the string S, where all characters have been mapped\n\
5672through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005673Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5674Unmapped characters are left untouched. Characters mapped to None\n\
5675are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676
5677static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005678unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 return PyUnicode_TranslateCharmap(self->str,
5681 self->length,
5682 table,
5683 "ignore");
5684}
5685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005686PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687"S.upper() -> unicode\n\
5688\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005689Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690
5691static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005692unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 return fixup(self, fixupper);
5695}
5696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005697PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698"S.zfill(width) -> unicode\n\
5699\n\
5700Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005701of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702
5703static PyObject *
5704unicode_zfill(PyUnicodeObject *self, PyObject *args)
5705{
5706 int fill;
5707 PyUnicodeObject *u;
5708
5709 int width;
5710 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5711 return NULL;
5712
5713 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005714 if (PyUnicode_CheckExact(self)) {
5715 Py_INCREF(self);
5716 return (PyObject*) self;
5717 }
5718 else
5719 return PyUnicode_FromUnicode(
5720 PyUnicode_AS_UNICODE(self),
5721 PyUnicode_GET_SIZE(self)
5722 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 }
5724
5725 fill = width - self->length;
5726
5727 u = pad(self, fill, 0, '0');
5728
Walter Dörwald068325e2002-04-15 13:36:47 +00005729 if (u == NULL)
5730 return NULL;
5731
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 if (u->str[fill] == '+' || u->str[fill] == '-') {
5733 /* move sign to beginning of string */
5734 u->str[0] = u->str[fill];
5735 u->str[fill] = '0';
5736 }
5737
5738 return (PyObject*) u;
5739}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740
#if 0
/* Compiled out: returns the current value of unicode_freelist_size as an
   int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
5748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005749PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005750"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005752Return True if S starts with the specified prefix, False otherwise.\n\
5753With optional start, test S beginning at that position.\n\
5754With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755
5756static PyObject *
5757unicode_startswith(PyUnicodeObject *self,
5758 PyObject *args)
5759{
5760 PyUnicodeObject *substring;
5761 int start = 0;
5762 int end = INT_MAX;
5763 PyObject *result;
5764
Guido van Rossumb8872e62000-05-09 14:14:27 +00005765 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5766 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 return NULL;
5768 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5769 (PyObject *)substring);
5770 if (substring == NULL)
5771 return NULL;
5772
Guido van Rossum77f6a652002-04-03 22:41:51 +00005773 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774
5775 Py_DECREF(substring);
5776 return result;
5777}
5778
5779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005780PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005781"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005783Return True if S ends with the specified suffix, False otherwise.\n\
5784With optional start, test S beginning at that position.\n\
5785With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786
5787static PyObject *
5788unicode_endswith(PyUnicodeObject *self,
5789 PyObject *args)
5790{
5791 PyUnicodeObject *substring;
5792 int start = 0;
5793 int end = INT_MAX;
5794 PyObject *result;
5795
Guido van Rossumb8872e62000-05-09 14:14:27 +00005796 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5797 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 return NULL;
5799 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5800 (PyObject *)substring);
5801 if (substring == NULL)
5802 return NULL;
5803
Guido van Rossum77f6a652002-04-03 22:41:51 +00005804 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805
5806 Py_DECREF(substring);
5807 return result;
5808}
5809
5810
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005811
5812static PyObject *
5813unicode_getnewargs(PyUnicodeObject *v)
5814{
5815 return Py_BuildValue("(u#)", v->str, v->length);
5816}
5817
5818
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819static PyMethodDef unicode_methods[] = {
5820
5821 /* Order is according to common usage: often used methods should
5822 appear first, since lookup is done sequentially. */
5823
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005824 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5825 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5826 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5827 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5828 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5829 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5830 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5831 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5832 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5833 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5834 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5835 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5836 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005837 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005838/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5839 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5840 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5841 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005842 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005843 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005844 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005845 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5846 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5847 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5848 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5849 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5850 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5851 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5852 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5853 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5854 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5855 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5856 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5857 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5858 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005859 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005860#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005861 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862#endif
5863
5864#if 0
5865 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005866 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867#endif
5868
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005869 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 {NULL, NULL}
5871};
5872
Neil Schemenauerce30bc92002-11-18 16:10:18 +00005873static PyObject *
5874unicode_mod(PyObject *v, PyObject *w)
5875{
5876 if (!PyUnicode_Check(v)) {
5877 Py_INCREF(Py_NotImplemented);
5878 return Py_NotImplemented;
5879 }
5880 return PyUnicode_Format(v, w);
5881}
5882
5883static PyNumberMethods unicode_as_number = {
5884 0, /*nb_add*/
5885 0, /*nb_subtract*/
5886 0, /*nb_multiply*/
5887 0, /*nb_divide*/
5888 unicode_mod, /*nb_remainder*/
5889};
5890
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891static PySequenceMethods unicode_as_sequence = {
5892 (inquiry) unicode_length, /* sq_length */
5893 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5894 (intargfunc) unicode_repeat, /* sq_repeat */
5895 (intargfunc) unicode_getitem, /* sq_item */
5896 (intintargfunc) unicode_slice, /* sq_slice */
5897 0, /* sq_ass_item */
5898 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005899 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900};
5901
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005902static PyObject*
5903unicode_subscript(PyUnicodeObject* self, PyObject* item)
5904{
5905 if (PyInt_Check(item)) {
5906 long i = PyInt_AS_LONG(item);
5907 if (i < 0)
5908 i += PyString_GET_SIZE(self);
5909 return unicode_getitem(self, i);
5910 } else if (PyLong_Check(item)) {
5911 long i = PyLong_AsLong(item);
5912 if (i == -1 && PyErr_Occurred())
5913 return NULL;
5914 if (i < 0)
5915 i += PyString_GET_SIZE(self);
5916 return unicode_getitem(self, i);
5917 } else if (PySlice_Check(item)) {
5918 int start, stop, step, slicelength, cur, i;
5919 Py_UNICODE* source_buf;
5920 Py_UNICODE* result_buf;
5921 PyObject* result;
5922
5923 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5924 &start, &stop, &step, &slicelength) < 0) {
5925 return NULL;
5926 }
5927
5928 if (slicelength <= 0) {
5929 return PyUnicode_FromUnicode(NULL, 0);
5930 } else {
5931 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5932 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5933
5934 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5935 result_buf[i] = source_buf[cur];
5936 }
5937
5938 result = PyUnicode_FromUnicode(result_buf, slicelength);
5939 PyMem_FREE(result_buf);
5940 return result;
5941 }
5942 } else {
5943 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5944 return NULL;
5945 }
5946}
5947
5948static PyMappingMethods unicode_as_mapping = {
5949 (inquiry)unicode_length, /* mp_length */
5950 (binaryfunc)unicode_subscript, /* mp_subscript */
5951 (objobjargproc)0, /* mp_ass_subscript */
5952};
5953
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954static int
5955unicode_buffer_getreadbuf(PyUnicodeObject *self,
5956 int index,
5957 const void **ptr)
5958{
5959 if (index != 0) {
5960 PyErr_SetString(PyExc_SystemError,
5961 "accessing non-existent unicode segment");
5962 return -1;
5963 }
5964 *ptr = (void *) self->str;
5965 return PyUnicode_GET_DATA_SIZE(self);
5966}
5967
5968static int
5969unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5970 const void **ptr)
5971{
5972 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005973 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 return -1;
5975}
5976
5977static int
5978unicode_buffer_getsegcount(PyUnicodeObject *self,
5979 int *lenp)
5980{
5981 if (lenp)
5982 *lenp = PyUnicode_GET_DATA_SIZE(self);
5983 return 1;
5984}
5985
5986static int
5987unicode_buffer_getcharbuf(PyUnicodeObject *self,
5988 int index,
5989 const void **ptr)
5990{
5991 PyObject *str;
5992
5993 if (index != 0) {
5994 PyErr_SetString(PyExc_SystemError,
5995 "accessing non-existent unicode segment");
5996 return -1;
5997 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005998 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 if (str == NULL)
6000 return -1;
6001 *ptr = (void *) PyString_AS_STRING(str);
6002 return PyString_GET_SIZE(str);
6003}
6004
6005/* Helpers for PyUnicode_Format() */
6006
6007static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006008getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009{
6010 int argidx = *p_argidx;
6011 if (argidx < arglen) {
6012 (*p_argidx)++;
6013 if (arglen < 0)
6014 return args;
6015 else
6016 return PyTuple_GetItem(args, argidx);
6017 }
6018 PyErr_SetString(PyExc_TypeError,
6019 "not enough arguments for format string");
6020 return NULL;
6021}
6022
/* Single-bit flag values (bits 0 through 4) */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6028
6029static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031{
6032 register int i;
6033 int len;
6034 va_list va;
6035 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037
6038 /* First, format the string as char array, then expand to Py_UNICODE
6039 array. */
6040 charbuffer = (char *)buffer;
6041 len = vsprintf(charbuffer, format, va);
6042 for (i = len - 1; i >= 0; i--)
6043 buffer[i] = (Py_UNICODE) charbuffer[i];
6044
6045 va_end(va);
6046 return len;
6047}
6048
Guido van Rossum078151d2002-08-11 04:24:12 +00006049/* XXX To save some code duplication, formatfloat/long/int could have been
6050 shared with stringobject.c, converting from 8-bit to Unicode after the
6051 formatting is done. */
6052
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053static int
6054formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006055 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 int flags,
6057 int prec,
6058 int type,
6059 PyObject *v)
6060{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006061 /* fmt = '%#.' + `prec` + `type`
6062 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 char fmt[20];
6064 double x;
6065
6066 x = PyFloat_AsDouble(v);
6067 if (x == -1.0 && PyErr_Occurred())
6068 return -1;
6069 if (prec < 0)
6070 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6072 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006073 /* Worst case length calc to ensure no buffer overrun:
6074
6075 'g' formats:
6076 fmt = %#.<prec>g
6077 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6078 for any double rep.)
6079 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6080
6081 'f' formats:
6082 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6083 len = 1 + 50 + 1 + prec = 52 + prec
6084
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006085 If prec=0 the effective precision is 1 (the leading digit is
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006086 always given), therefore increase the length by one.
6087
6088 */
6089 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6090 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006091 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006092 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006093 return -1;
6094 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006095 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6096 (flags&F_ALT) ? "#" : "",
6097 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 return usprintf(buf, fmt, x);
6099}
6100
Tim Peters38fd5b62000-09-21 05:43:11 +00006101static PyObject*
6102formatlong(PyObject *val, int flags, int prec, int type)
6103{
6104 char *buf;
6105 int i, len;
6106 PyObject *str; /* temporary string object. */
6107 PyUnicodeObject *result;
6108
6109 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6110 if (!str)
6111 return NULL;
6112 result = _PyUnicode_New(len);
6113 for (i = 0; i < len; i++)
6114 result->str[i] = buf[i];
6115 result->str[len] = 0;
6116 Py_DECREF(str);
6117 return (PyObject*)result;
6118}
6119
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120static int
6121formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006122 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 int flags,
6124 int prec,
6125 int type,
6126 PyObject *v)
6127{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006128 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006129 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6130 * + 1 + 1
6131 * = 24
6132 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006133 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 long x;
6135
6136 x = PyInt_AsLong(v);
6137 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006138 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00006139 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00006140 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00006141 "%u/%o/%x/%X of negative int will return "
6142 "a signed string in Python 2.4 and up") < 0)
6143 return -1;
6144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006146 prec = 1;
6147
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006148 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006149 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6150 */
6151 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006152 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006153 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006154 return -1;
6155 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006156
6157 if ((flags & F_ALT) &&
6158 (type == 'x' || type == 'X')) {
6159 /* When converting under %#x or %#X, there are a number
6160 * of issues that cause pain:
6161 * - when 0 is being converted, the C standard leaves off
6162 * the '0x' or '0X', which is inconsistent with other
6163 * %#x/%#X conversions and inconsistent with Python's
6164 * hex() function
6165 * - there are platforms that violate the standard and
6166 * convert 0 with the '0x' or '0X'
6167 * (Metrowerks, Compaq Tru64)
6168 * - there are platforms that give '0x' when converting
6169 * under %#X, but convert 0 in accordance with the
6170 * standard (OS/2 EMX)
6171 *
6172 * We can achieve the desired consistency by inserting our
6173 * own '0x' or '0X' prefix, and substituting %x/%X in place
6174 * of %#x/%#X.
6175 *
6176 * Note that this is the same approach as used in
6177 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006178 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006179 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6180 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006181 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006182 else {
6183 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6184 (flags&F_ALT) ? "#" : "",
6185 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006186 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 return usprintf(buf, fmt, x);
6188}
6189
6190static int
6191formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006192 size_t buflen,
6193 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006195 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006196 if (PyUnicode_Check(v)) {
6197 if (PyUnicode_GET_SIZE(v) != 1)
6198 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006202 else if (PyString_Check(v)) {
6203 if (PyString_GET_SIZE(v) != 1)
6204 goto onError;
6205 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207
6208 else {
6209 /* Integer input truncated to a character */
6210 long x;
6211 x = PyInt_AsLong(v);
6212 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006213 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006214#ifdef Py_UNICODE_WIDE
6215 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006216 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006217 "%c arg not in range(0x110000) "
6218 "(wide Python build)");
6219 return -1;
6220 }
6221#else
6222 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006223 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006224 "%c arg not in range(0x10000) "
6225 "(narrow Python build)");
6226 return -1;
6227 }
6228#endif
6229 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 }
6231 buf[1] = '\0';
6232 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006233
6234 onError:
6235 PyErr_SetString(PyExc_TypeError,
6236 "%c requires int or char");
6237 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238}
6239
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006240/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6241
6242 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6243 chars are formatted. XXX This is a magic number. Each formatting
6244 routine does bounds checking to ensure no overflow, but a better
6245 solution may be to malloc a buffer of appropriate size for each
6246 format. For now, the current solution is sufficient.
6247*/
6248#define FORMATBUFLEN (size_t)120
6249
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250PyObject *PyUnicode_Format(PyObject *format,
6251 PyObject *args)
6252{
6253 Py_UNICODE *fmt, *res;
6254 int fmtcnt, rescnt, reslen, arglen, argidx;
6255 int args_owned = 0;
6256 PyUnicodeObject *result = NULL;
6257 PyObject *dict = NULL;
6258 PyObject *uformat;
6259
6260 if (format == NULL || args == NULL) {
6261 PyErr_BadInternalCall();
6262 return NULL;
6263 }
6264 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006265 if (uformat == NULL)
6266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 fmt = PyUnicode_AS_UNICODE(uformat);
6268 fmtcnt = PyUnicode_GET_SIZE(uformat);
6269
6270 reslen = rescnt = fmtcnt + 100;
6271 result = _PyUnicode_New(reslen);
6272 if (result == NULL)
6273 goto onError;
6274 res = PyUnicode_AS_UNICODE(result);
6275
6276 if (PyTuple_Check(args)) {
6277 arglen = PyTuple_Size(args);
6278 argidx = 0;
6279 }
6280 else {
6281 arglen = -1;
6282 argidx = -2;
6283 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006284 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6285 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 dict = args;
6287
6288 while (--fmtcnt >= 0) {
6289 if (*fmt != '%') {
6290 if (--rescnt < 0) {
6291 rescnt = fmtcnt + 100;
6292 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006293 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 return NULL;
6295 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6296 --rescnt;
6297 }
6298 *res++ = *fmt++;
6299 }
6300 else {
6301 /* Got a format specifier */
6302 int flags = 0;
6303 int width = -1;
6304 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 Py_UNICODE c = '\0';
6306 Py_UNICODE fill;
6307 PyObject *v = NULL;
6308 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006309 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310 Py_UNICODE sign;
6311 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006312 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313
6314 fmt++;
6315 if (*fmt == '(') {
6316 Py_UNICODE *keystart;
6317 int keylen;
6318 PyObject *key;
6319 int pcount = 1;
6320
6321 if (dict == NULL) {
6322 PyErr_SetString(PyExc_TypeError,
6323 "format requires a mapping");
6324 goto onError;
6325 }
6326 ++fmt;
6327 --fmtcnt;
6328 keystart = fmt;
6329 /* Skip over balanced parentheses */
6330 while (pcount > 0 && --fmtcnt >= 0) {
6331 if (*fmt == ')')
6332 --pcount;
6333 else if (*fmt == '(')
6334 ++pcount;
6335 fmt++;
6336 }
6337 keylen = fmt - keystart - 1;
6338 if (fmtcnt < 0 || pcount > 0) {
6339 PyErr_SetString(PyExc_ValueError,
6340 "incomplete format key");
6341 goto onError;
6342 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006343#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006344 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 then looked up since Python uses strings to hold
6346 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006347 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 key = PyUnicode_EncodeUTF8(keystart,
6349 keylen,
6350 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006351#else
6352 key = PyUnicode_FromUnicode(keystart, keylen);
6353#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 if (key == NULL)
6355 goto onError;
6356 if (args_owned) {
6357 Py_DECREF(args);
6358 args_owned = 0;
6359 }
6360 args = PyObject_GetItem(dict, key);
6361 Py_DECREF(key);
6362 if (args == NULL) {
6363 goto onError;
6364 }
6365 args_owned = 1;
6366 arglen = -1;
6367 argidx = -2;
6368 }
6369 while (--fmtcnt >= 0) {
6370 switch (c = *fmt++) {
6371 case '-': flags |= F_LJUST; continue;
6372 case '+': flags |= F_SIGN; continue;
6373 case ' ': flags |= F_BLANK; continue;
6374 case '#': flags |= F_ALT; continue;
6375 case '0': flags |= F_ZERO; continue;
6376 }
6377 break;
6378 }
6379 if (c == '*') {
6380 v = getnextarg(args, arglen, &argidx);
6381 if (v == NULL)
6382 goto onError;
6383 if (!PyInt_Check(v)) {
6384 PyErr_SetString(PyExc_TypeError,
6385 "* wants int");
6386 goto onError;
6387 }
6388 width = PyInt_AsLong(v);
6389 if (width < 0) {
6390 flags |= F_LJUST;
6391 width = -width;
6392 }
6393 if (--fmtcnt >= 0)
6394 c = *fmt++;
6395 }
6396 else if (c >= '0' && c <= '9') {
6397 width = c - '0';
6398 while (--fmtcnt >= 0) {
6399 c = *fmt++;
6400 if (c < '0' || c > '9')
6401 break;
6402 if ((width*10) / 10 != width) {
6403 PyErr_SetString(PyExc_ValueError,
6404 "width too big");
6405 goto onError;
6406 }
6407 width = width*10 + (c - '0');
6408 }
6409 }
6410 if (c == '.') {
6411 prec = 0;
6412 if (--fmtcnt >= 0)
6413 c = *fmt++;
6414 if (c == '*') {
6415 v = getnextarg(args, arglen, &argidx);
6416 if (v == NULL)
6417 goto onError;
6418 if (!PyInt_Check(v)) {
6419 PyErr_SetString(PyExc_TypeError,
6420 "* wants int");
6421 goto onError;
6422 }
6423 prec = PyInt_AsLong(v);
6424 if (prec < 0)
6425 prec = 0;
6426 if (--fmtcnt >= 0)
6427 c = *fmt++;
6428 }
6429 else if (c >= '0' && c <= '9') {
6430 prec = c - '0';
6431 while (--fmtcnt >= 0) {
6432 c = Py_CHARMASK(*fmt++);
6433 if (c < '0' || c > '9')
6434 break;
6435 if ((prec*10) / 10 != prec) {
6436 PyErr_SetString(PyExc_ValueError,
6437 "prec too big");
6438 goto onError;
6439 }
6440 prec = prec*10 + (c - '0');
6441 }
6442 }
6443 } /* prec */
6444 if (fmtcnt >= 0) {
6445 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 if (--fmtcnt >= 0)
6447 c = *fmt++;
6448 }
6449 }
6450 if (fmtcnt < 0) {
6451 PyErr_SetString(PyExc_ValueError,
6452 "incomplete format");
6453 goto onError;
6454 }
6455 if (c != '%') {
6456 v = getnextarg(args, arglen, &argidx);
6457 if (v == NULL)
6458 goto onError;
6459 }
6460 sign = 0;
6461 fill = ' ';
6462 switch (c) {
6463
6464 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006465 pbuf = formatbuf;
6466 /* presume that buffer length is at least 1 */
6467 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 len = 1;
6469 break;
6470
6471 case 's':
6472 case 'r':
6473 if (PyUnicode_Check(v) && c == 's') {
6474 temp = v;
6475 Py_INCREF(temp);
6476 }
6477 else {
6478 PyObject *unicode;
6479 if (c == 's')
6480 temp = PyObject_Str(v);
6481 else
6482 temp = PyObject_Repr(v);
6483 if (temp == NULL)
6484 goto onError;
6485 if (!PyString_Check(temp)) {
6486 /* XXX Note: this should never happen, since
6487 PyObject_Repr() and PyObject_Str() assure
6488 this */
6489 Py_DECREF(temp);
6490 PyErr_SetString(PyExc_TypeError,
6491 "%s argument has non-string str()");
6492 goto onError;
6493 }
Fred Drakee4315f52000-05-09 19:53:39 +00006494 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006496 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 "strict");
6498 Py_DECREF(temp);
6499 temp = unicode;
6500 if (temp == NULL)
6501 goto onError;
6502 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006503 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 len = PyUnicode_GET_SIZE(temp);
6505 if (prec >= 0 && len > prec)
6506 len = prec;
6507 break;
6508
6509 case 'i':
6510 case 'd':
6511 case 'u':
6512 case 'o':
6513 case 'x':
6514 case 'X':
6515 if (c == 'i')
6516 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006517 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006518 temp = formatlong(v, flags, prec, c);
6519 if (!temp)
6520 goto onError;
6521 pbuf = PyUnicode_AS_UNICODE(temp);
6522 len = PyUnicode_GET_SIZE(temp);
6523 /* unbounded ints can always produce
6524 a sign character! */
6525 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006527 else {
6528 pbuf = formatbuf;
6529 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6530 flags, prec, c, v);
6531 if (len < 0)
6532 goto onError;
6533 /* only d conversion is signed */
6534 sign = c == 'd';
6535 }
6536 if (flags & F_ZERO)
6537 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 break;
6539
6540 case 'e':
6541 case 'E':
6542 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006543 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 case 'g':
6545 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006546 if (c == 'F')
6547 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006548 pbuf = formatbuf;
6549 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6550 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 if (len < 0)
6552 goto onError;
6553 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006554 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 fill = '0';
6556 break;
6557
6558 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006559 pbuf = formatbuf;
6560 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 if (len < 0)
6562 goto onError;
6563 break;
6564
6565 default:
6566 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006567 "unsupported format character '%c' (0x%x) "
6568 "at index %i",
Neal Norwitza0378e12002-09-13 13:47:06 +00006569 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006570 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006571 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 goto onError;
6573 }
6574 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006575 if (*pbuf == '-' || *pbuf == '+') {
6576 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 len--;
6578 }
6579 else if (flags & F_SIGN)
6580 sign = '+';
6581 else if (flags & F_BLANK)
6582 sign = ' ';
6583 else
6584 sign = 0;
6585 }
6586 if (width < len)
6587 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006588 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 reslen -= rescnt;
6590 rescnt = width + fmtcnt + 100;
6591 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006592 if (reslen < 0) {
6593 Py_DECREF(result);
6594 return PyErr_NoMemory();
6595 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006596 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597 return NULL;
6598 res = PyUnicode_AS_UNICODE(result)
6599 + reslen - rescnt;
6600 }
6601 if (sign) {
6602 if (fill != ' ')
6603 *res++ = sign;
6604 rescnt--;
6605 if (width > len)
6606 width--;
6607 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006608 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6609 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006610 assert(pbuf[1] == c);
6611 if (fill != ' ') {
6612 *res++ = *pbuf++;
6613 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006614 }
Tim Petersfff53252001-04-12 18:38:48 +00006615 rescnt -= 2;
6616 width -= 2;
6617 if (width < 0)
6618 width = 0;
6619 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006620 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 if (width > len && !(flags & F_LJUST)) {
6622 do {
6623 --rescnt;
6624 *res++ = fill;
6625 } while (--width > len);
6626 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006627 if (fill == ' ') {
6628 if (sign)
6629 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006630 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006631 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006632 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006633 *res++ = *pbuf++;
6634 *res++ = *pbuf++;
6635 }
6636 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006637 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 res += len;
6639 rescnt -= len;
6640 while (--width >= len) {
6641 --rescnt;
6642 *res++ = ' ';
6643 }
6644 if (dict && (argidx < arglen) && c != '%') {
6645 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006646 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 goto onError;
6648 }
6649 Py_XDECREF(temp);
6650 } /* '%' */
6651 } /* until end */
6652 if (argidx < arglen && !dict) {
6653 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006654 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 goto onError;
6656 }
6657
6658 if (args_owned) {
6659 Py_DECREF(args);
6660 }
6661 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006662 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006663 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 return (PyObject *)result;
6665
6666 onError:
6667 Py_XDECREF(result);
6668 Py_DECREF(uformat);
6669 if (args_owned) {
6670 Py_DECREF(args);
6671 }
6672 return NULL;
6673}
6674
6675static PyBufferProcs unicode_as_buffer = {
6676 (getreadbufferproc) unicode_buffer_getreadbuf,
6677 (getwritebufferproc) unicode_buffer_getwritebuf,
6678 (getsegcountproc) unicode_buffer_getsegcount,
6679 (getcharbufferproc) unicode_buffer_getcharbuf,
6680};
6681
Jeremy Hylton938ace62002-07-17 16:30:39 +00006682static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006683unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6684
Tim Peters6d6c1a32001-08-02 04:15:00 +00006685static PyObject *
6686unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6687{
6688 PyObject *x = NULL;
6689 static char *kwlist[] = {"string", "encoding", "errors", 0};
6690 char *encoding = NULL;
6691 char *errors = NULL;
6692
Guido van Rossume023fe02001-08-30 03:12:59 +00006693 if (type != &PyUnicode_Type)
6694 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006695 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6696 kwlist, &x, &encoding, &errors))
6697 return NULL;
6698 if (x == NULL)
6699 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006700 if (encoding == NULL && errors == NULL)
6701 return PyObject_Unicode(x);
6702 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006703 return PyUnicode_FromEncodedObject(x, encoding, errors);
6704}
6705
Guido van Rossume023fe02001-08-30 03:12:59 +00006706static PyObject *
6707unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6708{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006709 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006710 int n;
6711
6712 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6713 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6714 if (tmp == NULL)
6715 return NULL;
6716 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006717 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006718 if (pnew == NULL) {
6719 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00006720 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00006721 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006722 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6723 if (pnew->str == NULL) {
6724 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006725 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006726 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00006727 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00006728 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006729 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6730 pnew->length = n;
6731 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006732 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006733 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006734}
6735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006736PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00006737"unicode(string [, encoding[, errors]]) -> object\n\
6738\n\
6739Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00006740encoding defaults to the current default string encoding.\n\
6741errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006742
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743PyTypeObject PyUnicode_Type = {
6744 PyObject_HEAD_INIT(&PyType_Type)
6745 0, /* ob_size */
6746 "unicode", /* tp_name */
6747 sizeof(PyUnicodeObject), /* tp_size */
6748 0, /* tp_itemsize */
6749 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00006750 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006752 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 0, /* tp_setattr */
6754 (cmpfunc) unicode_compare, /* tp_compare */
6755 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006756 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006758 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 (hashfunc) unicode_hash, /* tp_hash*/
6760 0, /* tp_call*/
6761 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006762 PyObject_GenericGetAttr, /* tp_getattro */
6763 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006765 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
6766 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006767 unicode_doc, /* tp_doc */
6768 0, /* tp_traverse */
6769 0, /* tp_clear */
6770 0, /* tp_richcompare */
6771 0, /* tp_weaklistoffset */
6772 0, /* tp_iter */
6773 0, /* tp_iternext */
6774 unicode_methods, /* tp_methods */
6775 0, /* tp_members */
6776 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00006777 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006778 0, /* tp_dict */
6779 0, /* tp_descr_get */
6780 0, /* tp_descr_set */
6781 0, /* tp_dictoffset */
6782 0, /* tp_init */
6783 0, /* tp_alloc */
6784 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006785 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786};
6787
6788/* Initialize the Unicode implementation */
6789
Thomas Wouters78890102000-07-22 19:25:51 +00006790void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006792 int i;
6793
Fred Drakee4315f52000-05-09 19:53:39 +00006794 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006795 unicode_freelist = NULL;
6796 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00006798 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006799 for (i = 0; i < 256; i++)
6800 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006801 if (PyType_Ready(&PyUnicode_Type) < 0)
6802 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803}
6804
6805/* Finalize the Unicode implementation */
6806
6807void
Thomas Wouters78890102000-07-22 19:25:51 +00006808_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006810 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006811 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006813 Py_XDECREF(unicode_empty);
6814 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006815
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006816 for (i = 0; i < 256; i++) {
6817 if (unicode_latin1[i]) {
6818 Py_DECREF(unicode_latin1[i]);
6819 unicode_latin1[i] = NULL;
6820 }
6821 }
6822
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006823 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 PyUnicodeObject *v = u;
6825 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006826 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006827 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006828 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006829 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006831 unicode_freelist = NULL;
6832 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006834
6835/*
6836Local variables:
6837c-basic-offset: 4
6838indent-tabs-mode: nil
6839End:
6840*/