blob: c56ef9fa27f7e01e4785b91294335dc869c657bd [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Maximum number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects on the free list.

   An object that is put on the free list keeps its character buffer
   allocated as long as its length is below this limit.  This saves
   malloc() calls for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory allocated.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature!  If Unicode objects cause
   core dumps, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switches; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000279 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000280 *unicode = (PyObject *)w;
281 return 0;
282 }
283
284 /* Note that we don't have to modify *unicode for unshared Unicode
285 objects, since we can modify them in-place. */
286 return unicode_resize(v, length);
287}
288
/* File-private shorthand for PyUnicode_Resize() that casts the address
   of the object variable to PyObject **.  For use in unicodeobject.c
   only ! */
#define _PyUnicode_Resize(unicodevar, length)                   \
    PyUnicode_Resize(((PyObject **)(unicodevar)), length)
292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
294 int size)
295{
296 PyUnicodeObject *unicode;
297
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000298 /* If the Unicode data is known at construction time, we can apply
299 some optimizations which share commonly used objects. */
300 if (u != NULL) {
301
302 /* Optimization for empty strings */
303 if (size == 0 && unicode_empty != NULL) {
304 Py_INCREF(unicode_empty);
305 return (PyObject *)unicode_empty;
306 }
307
308 /* Single character Unicode objects in the Latin-1 range are
309 shared when using this constructor */
310 if (size == 1 && *u < 256) {
311 unicode = unicode_latin1[*u];
312 if (!unicode) {
313 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000314 if (!unicode)
315 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000316 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000317 unicode_latin1[*u] = unicode;
318 }
319 Py_INCREF(unicode);
320 return (PyObject *)unicode;
321 }
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 unicode = _PyUnicode_New(size);
325 if (!unicode)
326 return NULL;
327
328 /* Copy the Unicode data into the new object */
329 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331
332 return (PyObject *)unicode;
333}
334
335#ifdef HAVE_WCHAR_H
336
337PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
338 int size)
339{
340 PyUnicodeObject *unicode;
341
342 if (w == NULL) {
343 PyErr_BadInternalCall();
344 return NULL;
345 }
346
347 unicode = _PyUnicode_New(size);
348 if (!unicode)
349 return NULL;
350
351 /* Copy the wchar_t data into the new object */
352#ifdef HAVE_USABLE_WCHAR_T
353 memcpy(unicode->str, w, size * sizeof(wchar_t));
354#else
355 {
356 register Py_UNICODE *u;
357 register int i;
358 u = PyUnicode_AS_UNICODE(unicode);
359 for (i = size; i >= 0; i--)
360 *u++ = *w++;
361 }
362#endif
363
364 return (PyObject *)unicode;
365}
366
367int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
368 register wchar_t *w,
369 int size)
370{
371 if (unicode == NULL) {
372 PyErr_BadInternalCall();
373 return -1;
374 }
375 if (size > PyUnicode_GET_SIZE(unicode))
376 size = PyUnicode_GET_SIZE(unicode);
377#ifdef HAVE_USABLE_WCHAR_T
378 memcpy(w, unicode->str, size * sizeof(wchar_t));
379#else
380 {
381 register Py_UNICODE *u;
382 register int i;
383 u = PyUnicode_AS_UNICODE(unicode);
384 for (i = size; i >= 0; i--)
385 *w++ = *u++;
386 }
387#endif
388
389 return size;
390}
391
392#endif
393
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000394PyObject *PyUnicode_FromOrdinal(int ordinal)
395{
396 Py_UNICODE s[2];
397
398#ifdef Py_UNICODE_WIDE
399 if (ordinal < 0 || ordinal > 0x10ffff) {
400 PyErr_SetString(PyExc_ValueError,
401 "unichr() arg not in range(0x110000) "
402 "(wide Python build)");
403 return NULL;
404 }
405#else
406 if (ordinal < 0 || ordinal > 0xffff) {
407 PyErr_SetString(PyExc_ValueError,
408 "unichr() arg not in range(0x10000) "
409 "(narrow Python build)");
410 return NULL;
411 }
412#endif
413
414 if (ordinal <= 0xffff) {
415 /* UCS-2 character */
416 s[0] = (Py_UNICODE) ordinal;
417 return PyUnicode_FromUnicode(s, 1);
418 }
419 else {
420#ifndef Py_UNICODE_WIDE
421 /* UCS-4 character. store as two surrogate characters */
422 ordinal -= 0x10000L;
423 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
424 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
425 return PyUnicode_FromUnicode(s, 2);
426#else
427 s[0] = (Py_UNICODE)ordinal;
428 return PyUnicode_FromUnicode(s, 1);
429#endif
430 }
431}
432
Guido van Rossumd57fd912000-03-10 22:53:23 +0000433PyObject *PyUnicode_FromObject(register PyObject *obj)
434{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000435 /* XXX Perhaps we should make this API an alias of
436 PyObject_Unicode() instead ?! */
437 if (PyUnicode_CheckExact(obj)) {
438 Py_INCREF(obj);
439 return obj;
440 }
441 if (PyUnicode_Check(obj)) {
442 /* For a Unicode subtype that's not a Unicode object,
443 return a true Unicode object with the same data. */
444 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
445 PyUnicode_GET_SIZE(obj));
446 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
448}
449
450PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
451 const char *encoding,
452 const char *errors)
453{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000456 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457
458 if (obj == NULL) {
459 PyErr_BadInternalCall();
460 return NULL;
461 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000462
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000463#if 0
464 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000465 that no encodings is given and then redirect to
466 PyObject_Unicode() which then applies the additional logic for
467 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000468
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000469 NOTE: This API should really only be used for object which
470 represent *encoded* Unicode !
471
472 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000473 if (PyUnicode_Check(obj)) {
474 if (encoding) {
475 PyErr_SetString(PyExc_TypeError,
476 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000477 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000478 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000479 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481#else
482 if (PyUnicode_Check(obj)) {
483 PyErr_SetString(PyExc_TypeError,
484 "decoding Unicode is not supported");
485 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000486 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487#endif
488
489 /* Coerce object */
490 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000491 s = PyString_AS_STRING(obj);
492 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000493 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000494 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
495 /* Overwrite the error message with something more useful in
496 case of a TypeError. */
497 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499 "coercing to Unicode: need string or buffer, "
500 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 obj->ob_type->tp_name);
502 goto onError;
503 }
504
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 if (len == 0) {
507 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 else
511 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000512
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 return v;
514
515 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517}
518
519PyObject *PyUnicode_Decode(const char *s,
520 int size,
521 const char *encoding,
522 const char *errors)
523{
524 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000525
526 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000527 encoding = PyUnicode_GetDefaultEncoding();
528
529 /* Shortcuts for common default encodings */
530 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000532 else if (strcmp(encoding, "latin-1") == 0)
533 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000534#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
535 else if (strcmp(encoding, "mbcs") == 0)
536 return PyUnicode_DecodeMBCS(s, size, errors);
537#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000538 else if (strcmp(encoding, "ascii") == 0)
539 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000540
541 /* Decode via the codec registry */
542 buffer = PyBuffer_FromMemory((void *)s, size);
543 if (buffer == NULL)
544 goto onError;
545 unicode = PyCodec_Decode(buffer, encoding, errors);
546 if (unicode == NULL)
547 goto onError;
548 if (!PyUnicode_Check(unicode)) {
549 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000550 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000551 unicode->ob_type->tp_name);
552 Py_DECREF(unicode);
553 goto onError;
554 }
555 Py_DECREF(buffer);
556 return unicode;
557
558 onError:
559 Py_XDECREF(buffer);
560 return NULL;
561}
562
563PyObject *PyUnicode_Encode(const Py_UNICODE *s,
564 int size,
565 const char *encoding,
566 const char *errors)
567{
568 PyObject *v, *unicode;
569
570 unicode = PyUnicode_FromUnicode(s, size);
571 if (unicode == NULL)
572 return NULL;
573 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
574 Py_DECREF(unicode);
575 return v;
576}
577
578PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
579 const char *encoding,
580 const char *errors)
581{
582 PyObject *v;
583
584 if (!PyUnicode_Check(unicode)) {
585 PyErr_BadArgument();
586 goto onError;
587 }
Fred Drakee4315f52000-05-09 19:53:39 +0000588
589 if (encoding == NULL)
590 encoding = PyUnicode_GetDefaultEncoding();
591
592 /* Shortcuts for common default encodings */
593 if (errors == NULL) {
594 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000595 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000596 else if (strcmp(encoding, "latin-1") == 0)
597 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000598#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
599 else if (strcmp(encoding, "mbcs") == 0)
600 return PyUnicode_AsMBCSString(unicode);
601#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000602 else if (strcmp(encoding, "ascii") == 0)
603 return PyUnicode_AsASCIIString(unicode);
604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605
606 /* Encode via the codec registry */
607 v = PyCodec_Encode(unicode, encoding, errors);
608 if (v == NULL)
609 goto onError;
610 /* XXX Should we really enforce this ? */
611 if (!PyString_Check(v)) {
612 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000613 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 v->ob_type->tp_name);
615 Py_DECREF(v);
616 goto onError;
617 }
618 return v;
619
620 onError:
621 return NULL;
622}
623
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000624PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
625 const char *errors)
626{
627 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
628
629 if (v)
630 return v;
631 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
632 if (v && errors == NULL)
633 ((PyUnicodeObject *)unicode)->defenc = v;
634 return v;
635}
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
638{
639 if (!PyUnicode_Check(unicode)) {
640 PyErr_BadArgument();
641 goto onError;
642 }
643 return PyUnicode_AS_UNICODE(unicode);
644
645 onError:
646 return NULL;
647}
648
649int PyUnicode_GetSize(PyObject *unicode)
650{
651 if (!PyUnicode_Check(unicode)) {
652 PyErr_BadArgument();
653 goto onError;
654 }
655 return PyUnicode_GET_SIZE(unicode);
656
657 onError:
658 return -1;
659}
660
Thomas Wouters78890102000-07-22 19:25:51 +0000661const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000662{
663 return unicode_default_encoding;
664}
665
666int PyUnicode_SetDefaultEncoding(const char *encoding)
667{
668 PyObject *v;
669
670 /* Make sure the encoding is valid. As side effect, this also
671 loads the encoding into the codec registry cache. */
672 v = _PyCodec_Lookup(encoding);
673 if (v == NULL)
674 goto onError;
675 Py_DECREF(v);
676 strncpy(unicode_default_encoding,
677 encoding,
678 sizeof(unicode_default_encoding));
679 return 0;
680
681 onError:
682 return -1;
683}
684
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000685/* error handling callback helper:
686 build arguments, call the callback and check the arguments,
687 if no exception occured, copy the replacement to the output
688 and adjust various state variables.
689 return 0 on success, -1 on error
690*/
691
692static
693int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
694 const char *encoding, const char *reason,
695 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
696 PyObject **output, int *outpos, Py_UNICODE **outptr)
697{
698 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
699
700 PyObject *restuple = NULL;
701 PyObject *repunicode = NULL;
702 int outsize = PyUnicode_GET_SIZE(*output);
703 int requiredsize;
704 int newpos;
705 Py_UNICODE *repptr;
706 int repsize;
707 int res = -1;
708
709 if (*errorHandler == NULL) {
710 *errorHandler = PyCodec_LookupError(errors);
711 if (*errorHandler == NULL)
712 goto onError;
713 }
714
715 if (*exceptionObject == NULL) {
716 *exceptionObject = PyUnicodeDecodeError_Create(
717 encoding, input, insize, *startinpos, *endinpos, reason);
718 if (*exceptionObject == NULL)
719 goto onError;
720 }
721 else {
722 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
723 goto onError;
724 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
725 goto onError;
726 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
727 goto onError;
728 }
729
730 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
731 if (restuple == NULL)
732 goto onError;
733 if (!PyTuple_Check(restuple)) {
734 PyErr_Format(PyExc_TypeError, &argparse[4]);
735 goto onError;
736 }
737 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
738 goto onError;
739 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000740 newpos = insize+newpos;
741 if (newpos<0 || newpos>insize) {
742 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
743 goto onError;
744 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000745
746 /* need more space? (at least enough for what we
747 have+the replacement+the rest of the string (starting
748 at the new input position), so we won't have to check space
749 when there are no errors in the rest of the string) */
750 repptr = PyUnicode_AS_UNICODE(repunicode);
751 repsize = PyUnicode_GET_SIZE(repunicode);
752 requiredsize = *outpos + repsize + insize-newpos;
753 if (requiredsize > outsize) {
754 if (requiredsize<2*outsize)
755 requiredsize = 2*outsize;
756 if (PyUnicode_Resize(output, requiredsize))
757 goto onError;
758 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
759 }
760 *endinpos = newpos;
761 *inptr = input + newpos;
762 Py_UNICODE_COPY(*outptr, repptr, repsize);
763 *outptr += repsize;
764 *outpos += repsize;
765 /* we made it! */
766 res = 0;
767
768 onError:
769 Py_XDECREF(restuple);
770 return res;
771}
772
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000773/* --- UTF-7 Codec -------------------------------------------------------- */
774
775/* see RFC2152 for details */
776
/* Classification of the 128 ASCII characters for UTF-7: indicates
   whether a character is special, i.e. cannot be directly encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   Each row below covers 16 consecutive character codes. */
static
char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
795
/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be written
   directly in UTF-7.  Characters above 127 and class 1 characters are
   always special; whitespace (class 2) and RFC2152 Set O (class 3)
   only when the corresponding flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 character for the low 6 bits of n */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is one of the 64 base64 characters.

   Explicit ASCII range checks are used instead of isalnum(): c may
   hold a value that is not representable as unsigned char (where
   calling isalnum() is undefined), and isalnum() depends on the
   current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): 6 bit value of the base64 character c; only meaningful if
   B64CHAR(c) is true */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, emit
   the topmost 6 pending bits as a base64 character through out and
   decrease bits accordingly.  out and bits must be lvalues. */
#define ENCODE(out, ch, bits) \
    while ((bits) >= 6) { \
        *(out)++ = B64((ch) >> ((bits)-6)); \
        (bits) -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, take the topmost 16 pending bits as one character.

   A character in the range 0xDC00-0xDFFF sets surrogate, sets errmsg
   and jumps to utf7Error; the character following such an error is
   dropped without a second error.  Everything else is stored through
   out.

   The expansion relies on a variable `errmsg` and a label `utf7Error`
   in the enclosing function; out, bits and surrogate must be
   lvalues. */
#define DECODE(out, ch, bits, surrogate) \
    while ((bits) >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        (bits) -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            (surrogate) = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            (surrogate) = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *(out)++ = outCh; \
        } \
    }
830
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000831PyObject *PyUnicode_DecodeUTF7(const char *s,
832 int size,
833 const char *errors)
834{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000835 const char *starts = s;
836 int startinpos;
837 int endinpos;
838 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000839 const char *e;
840 PyUnicodeObject *unicode;
841 Py_UNICODE *p;
842 const char *errmsg = "";
843 int inShift = 0;
844 unsigned int bitsleft = 0;
845 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000846 int surrogate = 0;
847 PyObject *errorHandler = NULL;
848 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000849
850 unicode = _PyUnicode_New(size);
851 if (!unicode)
852 return NULL;
853 if (size == 0)
854 return (PyObject *)unicode;
855
856 p = unicode->str;
857 e = s + size;
858
859 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 Py_UNICODE ch;
861 restart:
862 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000863
864 if (inShift) {
865 if ((ch == '-') || !B64CHAR(ch)) {
866 inShift = 0;
867 s++;
868
869 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
870 if (bitsleft >= 6) {
871 /* The shift sequence has a partial character in it. If
872 bitsleft < 6 then we could just classify it as padding
873 but that is not the case here */
874
875 errmsg = "partial character in shift sequence";
876 goto utf7Error;
877 }
878 /* According to RFC2152 the remaining bits should be zero. We
879 choose to signal an error/insert a replacement character
880 here so indicate the potential of a misencoded character. */
881
882 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
883 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
884 errmsg = "non-zero padding bits in shift sequence";
885 goto utf7Error;
886 }
887
888 if (ch == '-') {
889 if ((s < e) && (*(s) == '-')) {
890 *p++ = '-';
891 inShift = 1;
892 }
893 } else if (SPECIAL(ch,0,0)) {
894 errmsg = "unexpected special character";
895 goto utf7Error;
896 } else {
897 *p++ = ch;
898 }
899 } else {
900 charsleft = (charsleft << 6) | UB64(ch);
901 bitsleft += 6;
902 s++;
903 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
904 }
905 }
906 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000907 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908 s++;
909 if (s < e && *s == '-') {
910 s++;
911 *p++ = '+';
912 } else
913 {
914 inShift = 1;
915 bitsleft = 0;
916 }
917 }
918 else if (SPECIAL(ch,0,0)) {
919 errmsg = "unexpected special character";
920 s++;
921 goto utf7Error;
922 }
923 else {
924 *p++ = ch;
925 s++;
926 }
927 continue;
928 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000929 outpos = p-PyUnicode_AS_UNICODE(unicode);
930 endinpos = s-starts;
931 if (unicode_decode_call_errorhandler(
932 errors, &errorHandler,
933 "utf7", errmsg,
934 starts, size, &startinpos, &endinpos, &exc, &s,
935 (PyObject **)&unicode, &outpos, &p))
936 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000937 }
938
939 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000940 outpos = p-PyUnicode_AS_UNICODE(unicode);
941 endinpos = size;
942 if (unicode_decode_call_errorhandler(
943 errors, &errorHandler,
944 "utf7", "unterminated shift sequence",
945 starts, size, &startinpos, &endinpos, &exc, &s,
946 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000947 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000948 if (s < e)
949 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 }
951
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000952 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000953 goto onError;
954
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000955 Py_XDECREF(errorHandler);
956 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000957 return (PyObject *)unicode;
958
959onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000960 Py_XDECREF(errorHandler);
961 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000962 Py_DECREF(unicode);
963 return NULL;
964}
965
966
967PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
968 int size,
969 int encodeSetO,
970 int encodeWhiteSpace,
971 const char *errors)
972{
973 PyObject *v;
974 /* It might be possible to tighten this worst case */
975 unsigned int cbAllocated = 5 * size;
976 int inShift = 0;
977 int i = 0;
978 unsigned int bitsleft = 0;
979 unsigned long charsleft = 0;
980 char * out;
981 char * start;
982
983 if (size == 0)
984 return PyString_FromStringAndSize(NULL, 0);
985
986 v = PyString_FromStringAndSize(NULL, cbAllocated);
987 if (v == NULL)
988 return NULL;
989
990 start = out = PyString_AS_STRING(v);
991 for (;i < size; ++i) {
992 Py_UNICODE ch = s[i];
993
994 if (!inShift) {
995 if (ch == '+') {
996 *out++ = '+';
997 *out++ = '-';
998 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
999 charsleft = ch;
1000 bitsleft = 16;
1001 *out++ = '+';
1002 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1003 inShift = bitsleft > 0;
1004 } else {
1005 *out++ = (char) ch;
1006 }
1007 } else {
1008 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1009 *out++ = B64(charsleft << (6-bitsleft));
1010 charsleft = 0;
1011 bitsleft = 0;
1012 /* Characters not in the BASE64 set implicitly unshift the sequence
1013 so no '-' is required, except if the character is itself a '-' */
1014 if (B64CHAR(ch) || ch == '-') {
1015 *out++ = '-';
1016 }
1017 inShift = 0;
1018 *out++ = (char) ch;
1019 } else {
1020 bitsleft += 16;
1021 charsleft = (charsleft << 16) | ch;
1022 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1023
1024 /* If the next character is special then we dont' need to terminate
1025 the shift sequence. If the next character is not a BASE64 character
1026 or '-' then the shift sequence will be terminated implicitly and we
1027 don't have to insert a '-'. */
1028
1029 if (bitsleft == 0) {
1030 if (i + 1 < size) {
1031 Py_UNICODE ch2 = s[i+1];
1032
1033 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1034
1035 } else if (B64CHAR(ch2) || ch2 == '-') {
1036 *out++ = '-';
1037 inShift = 0;
1038 } else {
1039 inShift = 0;
1040 }
1041
1042 }
1043 else {
1044 *out++ = '-';
1045 inShift = 0;
1046 }
1047 }
1048 }
1049 }
1050 }
1051 if (bitsleft) {
1052 *out++= B64(charsleft << (6-bitsleft) );
1053 *out++ = '-';
1054 }
1055
Tim Peters5de98422002-04-27 18:44:32 +00001056 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001057 return v;
1058}
1059
/* The UTF-7 helper macros are private to the codec above. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1066
/* --- UTF-8 Codec -------------------------------------------------------- */
1068
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00-0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFD: four, five and six byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1090
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091PyObject *PyUnicode_DecodeUTF8(const char *s,
1092 int size,
1093 const char *errors)
1094{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001095 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001097 int startinpos;
1098 int endinpos;
1099 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 const char *e;
1101 PyUnicodeObject *unicode;
1102 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001103 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001104 PyObject *errorHandler = NULL;
1105 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001106
1107 /* Note: size will always be longer than the resulting Unicode
1108 character count */
1109 unicode = _PyUnicode_New(size);
1110 if (!unicode)
1111 return NULL;
1112 if (size == 0)
1113 return (PyObject *)unicode;
1114
1115 /* Unpack UTF-8 encoded data */
1116 p = unicode->str;
1117 e = s + size;
1118
1119 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001120 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121
1122 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001123 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001124 s++;
1125 continue;
1126 }
1127
1128 n = utf8_code_length[ch];
1129
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001130 if (s + n > e) {
1131 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001132 startinpos = s-starts;
1133 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001134 goto utf8Error;
1135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136
1137 switch (n) {
1138
1139 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001140 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001141 startinpos = s-starts;
1142 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001143 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144
1145 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001146 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001147 startinpos = s-starts;
1148 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001149 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150
1151 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001152 if ((s[1] & 0xc0) != 0x80) {
1153 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001154 startinpos = s-starts;
1155 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001156 goto utf8Error;
1157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001159 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001160 startinpos = s-starts;
1161 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001162 errmsg = "illegal encoding";
1163 goto utf8Error;
1164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001166 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 break;
1168
1169 case 3:
1170 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 (s[2] & 0xc0) != 0x80) {
1172 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001173 startinpos = s-starts;
1174 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001175 goto utf8Error;
1176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001178 if (ch < 0x0800) {
1179 /* Note: UTF-8 encodings of surrogates are considered
1180 legal UTF-8 sequences;
1181
1182 XXX For wide builds (UCS-4) we should probably try
1183 to recombine the surrogates into a single code
1184 unit.
1185 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001186 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001187 startinpos = s-starts;
1188 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001189 goto utf8Error;
1190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001192 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001193 break;
1194
1195 case 4:
1196 if ((s[1] & 0xc0) != 0x80 ||
1197 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001198 (s[3] & 0xc0) != 0x80) {
1199 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001200 startinpos = s-starts;
1201 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001202 goto utf8Error;
1203 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001204 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1205 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1206 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001207 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001208 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001209 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001210 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001211 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001212 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001213 startinpos = s-starts;
1214 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001215 goto utf8Error;
1216 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001217#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001218 *p++ = (Py_UNICODE)ch;
1219#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001220 /* compute and append the two surrogates: */
1221
1222 /* translate from 10000..10FFFF to 0..FFFF */
1223 ch -= 0x10000;
1224
1225 /* high surrogate = top 10 bits added to D800 */
1226 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1227
1228 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001229 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001230#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 break;
1232
1233 default:
1234 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001236 startinpos = s-starts;
1237 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001238 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 }
1240 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001241 continue;
1242
1243 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001244 outpos = p-PyUnicode_AS_UNICODE(unicode);
1245 if (unicode_decode_call_errorhandler(
1246 errors, &errorHandler,
1247 "utf8", errmsg,
1248 starts, size, &startinpos, &endinpos, &exc, &s,
1249 (PyObject **)&unicode, &outpos, &p))
1250 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 }
1252
1253 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001254 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 goto onError;
1256
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001257 Py_XDECREF(errorHandler);
1258 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 return (PyObject *)unicode;
1260
1261onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001262 Py_XDECREF(errorHandler);
1263 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 Py_DECREF(unicode);
1265 return NULL;
1266}
1267
Tim Peters602f7402002-04-27 18:03:26 +00001268/* Allocation strategy: if the string is short, convert into a stack buffer
1269 and allocate exactly as much space needed at the end. Else allocate the
1270 maximum possible needed (4 result bytes per Unicode character), and return
1271 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001272*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001273PyObject *
1274PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1275 int size,
1276 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277{
Tim Peters602f7402002-04-27 18:03:26 +00001278#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001279
Tim Peters602f7402002-04-27 18:03:26 +00001280 int i; /* index into s of next input byte */
1281 PyObject *v; /* result string object */
1282 char *p; /* next free byte in output buffer */
1283 int nallocated; /* number of result bytes allocated */
1284 int nneeded; /* number of result bytes needed */
1285 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001286
Tim Peters602f7402002-04-27 18:03:26 +00001287 assert(s != NULL);
1288 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
Tim Peters602f7402002-04-27 18:03:26 +00001290 if (size <= MAX_SHORT_UNICHARS) {
1291 /* Write into the stack buffer; nallocated can't overflow.
1292 * At the end, we'll allocate exactly as much heap space as it
1293 * turns out we need.
1294 */
1295 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1296 v = NULL; /* will allocate after we're done */
1297 p = stackbuf;
1298 }
1299 else {
1300 /* Overallocate on the heap, and give the excess back at the end. */
1301 nallocated = size * 4;
1302 if (nallocated / 4 != size) /* overflow! */
1303 return PyErr_NoMemory();
1304 v = PyString_FromStringAndSize(NULL, nallocated);
1305 if (v == NULL)
1306 return NULL;
1307 p = PyString_AS_STRING(v);
1308 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001309
Tim Peters602f7402002-04-27 18:03:26 +00001310 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001311 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001312
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001313 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001314 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001315 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001316
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001318 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001319 *p++ = (char)(0xc0 | (ch >> 6));
1320 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001321 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001322 else {
Tim Peters602f7402002-04-27 18:03:26 +00001323 /* Encode UCS2 Unicode ordinals */
1324 if (ch < 0x10000) {
1325 /* Special case: check for high surrogate */
1326 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1327 Py_UCS4 ch2 = s[i];
1328 /* Check for low surrogate and combine the two to
1329 form a UCS4 value */
1330 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001331 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001332 i++;
1333 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001334 }
Tim Peters602f7402002-04-27 18:03:26 +00001335 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001336 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001337 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001338 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1339 *p++ = (char)(0x80 | (ch & 0x3f));
1340 continue;
1341 }
1342encodeUCS4:
1343 /* Encode UCS4 Unicode ordinals */
1344 *p++ = (char)(0xf0 | (ch >> 18));
1345 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1346 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1347 *p++ = (char)(0x80 | (ch & 0x3f));
1348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001350
Tim Peters602f7402002-04-27 18:03:26 +00001351 if (v == NULL) {
1352 /* This was stack allocated. */
1353 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1354 assert(nneeded <= nallocated);
1355 v = PyString_FromStringAndSize(stackbuf, nneeded);
1356 }
1357 else {
1358 /* Cut back to size actually needed. */
1359 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1360 assert(nneeded <= nallocated);
1361 _PyString_Resize(&v, nneeded);
1362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001364
Tim Peters602f7402002-04-27 18:03:26 +00001365#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366}
1367
Guido van Rossumd57fd912000-03-10 22:53:23 +00001368PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1369{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 if (!PyUnicode_Check(unicode)) {
1371 PyErr_BadArgument();
1372 return NULL;
1373 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001374 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1375 PyUnicode_GET_SIZE(unicode),
1376 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377}
1378
/* --- UTF-16 Codec ------------------------------------------------------- */
1380
Tim Peters772747b2001-08-09 22:21:55 +00001381PyObject *
1382PyUnicode_DecodeUTF16(const char *s,
1383 int size,
1384 const char *errors,
1385 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001387 const char *starts = s;
1388 int startinpos;
1389 int endinpos;
1390 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391 PyUnicodeObject *unicode;
1392 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001393 const unsigned char *q, *e;
1394 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001395 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001396 /* Offsets from q for retrieving byte pairs in the right order. */
1397#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1398 int ihi = 1, ilo = 0;
1399#else
1400 int ihi = 0, ilo = 1;
1401#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001402 PyObject *errorHandler = NULL;
1403 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404
1405 /* Note: size will always be longer than the resulting Unicode
1406 character count */
1407 unicode = _PyUnicode_New(size);
1408 if (!unicode)
1409 return NULL;
1410 if (size == 0)
1411 return (PyObject *)unicode;
1412
1413 /* Unpack UTF-16 encoded data */
1414 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001415 q = (unsigned char *)s;
1416 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
1418 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001419 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001421 /* Check for BOM marks (U+FEFF) in the input and adjust current
1422 byte order setting accordingly. In native mode, the leading BOM
1423 mark is skipped, in all other modes, it is copied to the output
1424 stream as-is (giving a ZWNBSP character). */
1425 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001426 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001427#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001428 if (bom == 0xFEFF) {
1429 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001430 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001431 }
1432 else if (bom == 0xFFFE) {
1433 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001434 bo = 1;
1435 }
1436#else
Tim Peters772747b2001-08-09 22:21:55 +00001437 if (bom == 0xFEFF) {
1438 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001439 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001440 }
1441 else if (bom == 0xFFFE) {
1442 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001443 bo = -1;
1444 }
1445#endif
1446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447
Tim Peters772747b2001-08-09 22:21:55 +00001448 if (bo == -1) {
1449 /* force LE */
1450 ihi = 1;
1451 ilo = 0;
1452 }
1453 else if (bo == 1) {
1454 /* force BE */
1455 ihi = 0;
1456 ilo = 1;
1457 }
1458
1459 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001460 Py_UNICODE ch;
1461 /* remaing bytes at the end? (size should be even) */
1462 if (e-q<2) {
1463 errmsg = "truncated data";
1464 startinpos = ((const char *)q)-starts;
1465 endinpos = ((const char *)e)-starts;
1466 goto utf16Error;
1467 /* The remaining input chars are ignored if the callback
1468 chooses to skip the input */
1469 }
1470 ch = (q[ihi] << 8) | q[ilo];
1471
Tim Peters772747b2001-08-09 22:21:55 +00001472 q += 2;
1473
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 if (ch < 0xD800 || ch > 0xDFFF) {
1475 *p++ = ch;
1476 continue;
1477 }
1478
1479 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001480 if (q >= e) {
1481 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001482 startinpos = (((const char *)q)-2)-starts;
1483 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001484 goto utf16Error;
1485 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001486 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001487 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1488 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001489 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001490#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001491 *p++ = ch;
1492 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001493#else
1494 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001495#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001496 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001497 }
1498 else {
1499 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500 startinpos = (((const char *)q)-4)-starts;
1501 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001502 goto utf16Error;
1503 }
1504
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001506 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001507 startinpos = (((const char *)q)-2)-starts;
1508 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001509 /* Fall through to report the error */
1510
1511 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001512 outpos = p-PyUnicode_AS_UNICODE(unicode);
1513 if (unicode_decode_call_errorhandler(
1514 errors, &errorHandler,
1515 "utf16", errmsg,
1516 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1517 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001518 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 }
1520
1521 if (byteorder)
1522 *byteorder = bo;
1523
1524 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001525 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 goto onError;
1527
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001528 Py_XDECREF(errorHandler);
1529 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530 return (PyObject *)unicode;
1531
1532onError:
1533 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001534 Py_XDECREF(errorHandler);
1535 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001536 return NULL;
1537}
1538
Tim Peters772747b2001-08-09 22:21:55 +00001539PyObject *
1540PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1541 int size,
1542 const char *errors,
1543 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544{
1545 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001546 unsigned char *p;
1547 int i, pairs;
1548 /* Offsets from p for storing byte pairs in the right order. */
1549#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1550 int ihi = 1, ilo = 0;
1551#else
1552 int ihi = 0, ilo = 1;
1553#endif
1554
1555#define STORECHAR(CH) \
1556 do { \
1557 p[ihi] = ((CH) >> 8) & 0xff; \
1558 p[ilo] = (CH) & 0xff; \
1559 p += 2; \
1560 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001562 for (i = pairs = 0; i < size; i++)
1563 if (s[i] >= 0x10000)
1564 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001566 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001567 if (v == NULL)
1568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569
Tim Peters772747b2001-08-09 22:21:55 +00001570 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001572 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001573 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001574 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001575
1576 if (byteorder == -1) {
1577 /* force LE */
1578 ihi = 1;
1579 ilo = 0;
1580 }
1581 else if (byteorder == 1) {
1582 /* force BE */
1583 ihi = 0;
1584 ilo = 1;
1585 }
1586
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001587 while (size-- > 0) {
1588 Py_UNICODE ch = *s++;
1589 Py_UNICODE ch2 = 0;
1590 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001591 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1592 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 }
Tim Peters772747b2001-08-09 22:21:55 +00001594 STORECHAR(ch);
1595 if (ch2)
1596 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001598 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001599#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600}
1601
1602PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1603{
1604 if (!PyUnicode_Check(unicode)) {
1605 PyErr_BadArgument();
1606 return NULL;
1607 }
1608 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1609 PyUnicode_GET_SIZE(unicode),
1610 NULL,
1611 0);
1612}
1613
/* --- Unicode Escape Codec ----------------------------------------------- */
1615
Fredrik Lundh06d12682001-01-24 07:59:11 +00001616static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001617
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1619 int size,
1620 const char *errors)
1621{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001622 const char *starts = s;
1623 int startinpos;
1624 int endinpos;
1625 int outpos;
1626 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001627 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001630 char* message;
1631 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001632 PyObject *errorHandler = NULL;
1633 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001634
Guido van Rossumd57fd912000-03-10 22:53:23 +00001635 /* Escaped strings will always be longer than the resulting
1636 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001637 length after conversion to the true value.
1638 (but if the error callback returns a long replacement string
1639 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001640 v = _PyUnicode_New(size);
1641 if (v == NULL)
1642 goto onError;
1643 if (size == 0)
1644 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001648
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649 while (s < end) {
1650 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001651 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653
1654 /* Non-escape characters are interpreted as Unicode ordinals */
1655 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001656 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657 continue;
1658 }
1659
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001660 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 /* \ - Escapes */
1662 s++;
1663 switch (*s++) {
1664
1665 /* \x escapes */
1666 case '\n': break;
1667 case '\\': *p++ = '\\'; break;
1668 case '\'': *p++ = '\''; break;
1669 case '\"': *p++ = '\"'; break;
1670 case 'b': *p++ = '\b'; break;
1671 case 'f': *p++ = '\014'; break; /* FF */
1672 case 't': *p++ = '\t'; break;
1673 case 'n': *p++ = '\n'; break;
1674 case 'r': *p++ = '\r'; break;
1675 case 'v': *p++ = '\013'; break; /* VT */
1676 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1677
1678 /* \OOO (octal) escapes */
1679 case '0': case '1': case '2': case '3':
1680 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001681 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001683 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001685 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001687 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 break;
1689
Fredrik Lundhccc74732001-02-18 22:13:49 +00001690 /* hex escapes */
1691 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001693 digits = 2;
1694 message = "truncated \\xXX escape";
1695 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696
Fredrik Lundhccc74732001-02-18 22:13:49 +00001697 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001699 digits = 4;
1700 message = "truncated \\uXXXX escape";
1701 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702
Fredrik Lundhccc74732001-02-18 22:13:49 +00001703 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001704 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001705 digits = 8;
1706 message = "truncated \\UXXXXXXXX escape";
1707 hexescape:
1708 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001709 outpos = p-PyUnicode_AS_UNICODE(v);
1710 if (s+digits>end) {
1711 endinpos = size;
1712 if (unicode_decode_call_errorhandler(
1713 errors, &errorHandler,
1714 "unicodeescape", "end of string in escape sequence",
1715 starts, size, &startinpos, &endinpos, &exc, &s,
1716 (PyObject **)&v, &outpos, &p))
1717 goto onError;
1718 goto nextByte;
1719 }
1720 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001721 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001722 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001723 endinpos = (s+i+1)-starts;
1724 if (unicode_decode_call_errorhandler(
1725 errors, &errorHandler,
1726 "unicodeescape", message,
1727 starts, size, &startinpos, &endinpos, &exc, &s,
1728 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001729 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001730 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001731 }
1732 chr = (chr<<4) & ~0xF;
1733 if (c >= '0' && c <= '9')
1734 chr += c - '0';
1735 else if (c >= 'a' && c <= 'f')
1736 chr += 10 + c - 'a';
1737 else
1738 chr += 10 + c - 'A';
1739 }
1740 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001741 if (chr == 0xffffffff)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 /* _decoding_error will have already written into the
1743 target buffer. */
1744 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001745 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001746 /* when we get here, chr is a 32-bit unicode character */
1747 if (chr <= 0xffff)
1748 /* UCS-2 character */
1749 *p++ = (Py_UNICODE) chr;
1750 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001751 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001752 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001753#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001754 *p++ = chr;
1755#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001756 chr -= 0x10000L;
1757 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001758 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001759#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001760 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 endinpos = s-starts;
1762 outpos = p-PyUnicode_AS_UNICODE(v);
1763 if (unicode_decode_call_errorhandler(
1764 errors, &errorHandler,
1765 "unicodeescape", "illegal Unicode character",
1766 starts, size, &startinpos, &endinpos, &exc, &s,
1767 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001768 goto onError;
1769 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001770 break;
1771
1772 /* \N{name} */
1773 case 'N':
1774 message = "malformed \\N character escape";
1775 if (ucnhash_CAPI == NULL) {
1776 /* load the unicode data module */
1777 PyObject *m, *v;
1778 m = PyImport_ImportModule("unicodedata");
1779 if (m == NULL)
1780 goto ucnhashError;
1781 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1782 Py_DECREF(m);
1783 if (v == NULL)
1784 goto ucnhashError;
1785 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1786 Py_DECREF(v);
1787 if (ucnhash_CAPI == NULL)
1788 goto ucnhashError;
1789 }
1790 if (*s == '{') {
1791 const char *start = s+1;
1792 /* look for the closing brace */
1793 while (*s != '}' && s < end)
1794 s++;
1795 if (s > start && s < end && *s == '}') {
1796 /* found a name. look it up in the unicode database */
1797 message = "unknown Unicode character name";
1798 s++;
1799 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1800 goto store;
1801 }
1802 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 endinpos = s-starts;
1804 outpos = p-PyUnicode_AS_UNICODE(v);
1805 if (unicode_decode_call_errorhandler(
1806 errors, &errorHandler,
1807 "unicodeescape", message,
1808 starts, size, &startinpos, &endinpos, &exc, &s,
1809 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001810 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001811 break;
1812
1813 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001814 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 message = "\\ at end of string";
1816 s--;
1817 endinpos = s-starts;
1818 outpos = p-PyUnicode_AS_UNICODE(v);
1819 if (unicode_decode_call_errorhandler(
1820 errors, &errorHandler,
1821 "unicodeescape", message,
1822 starts, size, &startinpos, &endinpos, &exc, &s,
1823 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001824 goto onError;
1825 }
1826 else {
1827 *p++ = '\\';
1828 *p++ = (unsigned char)s[-1];
1829 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001830 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 nextByte:
1833 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1836 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001837 Py_XDECREF(errorHandler);
1838 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001840
Fredrik Lundhccc74732001-02-18 22:13:49 +00001841ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001842 PyErr_SetString(
1843 PyExc_UnicodeError,
1844 "\\N escapes not supported (can't load unicodedata module)"
1845 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001846 Py_XDECREF(errorHandler);
1847 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001848 return NULL;
1849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001852 Py_XDECREF(errorHandler);
1853 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 return NULL;
1855}
1856
1857/* Return a Unicode-Escape string version of the Unicode object.
1858
1859 If quotes is true, the string is enclosed in u"" or u'' quotes as
1860 appropriate.
1861
1862*/
1863
Barry Warsaw51ac5802000-03-20 16:36:48 +00001864static const Py_UNICODE *findchar(const Py_UNICODE *s,
1865 int size,
1866 Py_UNICODE ch);
1867
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868static
1869PyObject *unicodeescape_string(const Py_UNICODE *s,
1870 int size,
1871 int quotes)
1872{
1873 PyObject *repr;
1874 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001876 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001877
1878 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1879 if (repr == NULL)
1880 return NULL;
1881
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001882 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883
1884 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 *p++ = 'u';
1886 *p++ = (findchar(s, size, '\'') &&
1887 !findchar(s, size, '"')) ? '"' : '\'';
1888 }
1889 while (size-- > 0) {
1890 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001891
Guido van Rossumd57fd912000-03-10 22:53:23 +00001892 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001893 if (quotes &&
1894 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895 *p++ = '\\';
1896 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001897 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001899
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001900#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001901 /* Map 21-bit characters to '\U00xxxxxx' */
1902 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001903 int offset = p - PyString_AS_STRING(repr);
1904
1905 /* Resize the string if necessary */
1906 if (offset + 12 > PyString_GET_SIZE(repr)) {
1907 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001908 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001909 p = PyString_AS_STRING(repr) + offset;
1910 }
1911
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912 *p++ = '\\';
1913 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001914 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1915 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1916 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1917 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1918 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1919 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1920 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001921 *p++ = hexdigit[ch & 0x0000000F];
1922 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001923 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001924#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001925 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1926 else if (ch >= 0xD800 && ch < 0xDC00) {
1927 Py_UNICODE ch2;
1928 Py_UCS4 ucs;
1929
1930 ch2 = *s++;
1931 size--;
1932 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1933 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1934 *p++ = '\\';
1935 *p++ = 'U';
1936 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1937 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1938 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1939 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1940 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1941 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1942 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1943 *p++ = hexdigit[ucs & 0x0000000F];
1944 continue;
1945 }
1946 /* Fall through: isolated surrogates are copied as-is */
1947 s--;
1948 size++;
1949 }
1950
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001952 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953 *p++ = '\\';
1954 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001955 *p++ = hexdigit[(ch >> 12) & 0x000F];
1956 *p++ = hexdigit[(ch >> 8) & 0x000F];
1957 *p++ = hexdigit[(ch >> 4) & 0x000F];
1958 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001960
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001961 /* Map special whitespace to '\t', \n', '\r' */
1962 else if (ch == '\t') {
1963 *p++ = '\\';
1964 *p++ = 't';
1965 }
1966 else if (ch == '\n') {
1967 *p++ = '\\';
1968 *p++ = 'n';
1969 }
1970 else if (ch == '\r') {
1971 *p++ = '\\';
1972 *p++ = 'r';
1973 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001974
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001975 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001976 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001978 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001979 *p++ = hexdigit[(ch >> 4) & 0x000F];
1980 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001982
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983 /* Copy everything else as-is */
1984 else
1985 *p++ = (char) ch;
1986 }
1987 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001988 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989
1990 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001991 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return repr;
1993}
1994
1995PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1996 int size)
1997{
1998 return unicodeescape_string(s, size, 0);
1999}
2000
2001PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2002{
2003 if (!PyUnicode_Check(unicode)) {
2004 PyErr_BadArgument();
2005 return NULL;
2006 }
2007 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2008 PyUnicode_GET_SIZE(unicode));
2009}
2010
2011/* --- Raw Unicode Escape Codec ------------------------------------------- */
2012
2013PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2014 int size,
2015 const char *errors)
2016{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002017 const char *starts = s;
2018 int startinpos;
2019 int endinpos;
2020 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002022 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002023 const char *end;
2024 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002025 PyObject *errorHandler = NULL;
2026 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002027
2028 /* Escaped strings will always be longer than the resulting
2029 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002030 length after conversion to the true value. (But decoding error
2031 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032 v = _PyUnicode_New(size);
2033 if (v == NULL)
2034 goto onError;
2035 if (size == 0)
2036 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002037 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038 end = s + size;
2039 while (s < end) {
2040 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002041 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002043 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044
2045 /* Non-escape characters are interpreted as Unicode ordinals */
2046 if (*s != '\\') {
2047 *p++ = (unsigned char)*s++;
2048 continue;
2049 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002050 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051
2052 /* \u-escapes are only interpreted iff the number of leading
2053 backslashes if odd */
2054 bs = s;
2055 for (;s < end;) {
2056 if (*s != '\\')
2057 break;
2058 *p++ = (unsigned char)*s++;
2059 }
2060 if (((s - bs) & 1) == 0 ||
2061 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002062 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 continue;
2064 }
2065 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002066 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067 s++;
2068
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002069 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002071 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002074 endinpos = s-starts;
2075 if (unicode_decode_call_errorhandler(
2076 errors, &errorHandler,
2077 "rawunicodeescape", "truncated \\uXXXX",
2078 starts, size, &startinpos, &endinpos, &exc, &s,
2079 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 }
2083 x = (x<<4) & ~0xF;
2084 if (c >= '0' && c <= '9')
2085 x += c - '0';
2086 else if (c >= 'a' && c <= 'f')
2087 x += 10 + c - 'a';
2088 else
2089 x += 10 + c - 'A';
2090 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002091#ifndef Py_UNICODE_WIDE
2092 if (x > 0x10000) {
2093 if (unicode_decode_call_errorhandler(
2094 errors, &errorHandler,
2095 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2096 starts, size, &startinpos, &endinpos, &exc, &s,
2097 (PyObject **)&v, &outpos, &p))
2098 goto onError;
2099 }
2100#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002101 *p++ = x;
2102 nextByte:
2103 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002105 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002106 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107 Py_XDECREF(errorHandler);
2108 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109 return (PyObject *)v;
2110
2111 onError:
2112 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113 Py_XDECREF(errorHandler);
2114 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 return NULL;
2116}
2117
2118PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2119 int size)
2120{
2121 PyObject *repr;
2122 char *p;
2123 char *q;
2124
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002125 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002127#ifdef Py_UNICODE_WIDE
2128 repr = PyString_FromStringAndSize(NULL, 10 * size);
2129#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002131#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 if (repr == NULL)
2133 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002134 if (size == 0)
2135 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136
2137 p = q = PyString_AS_STRING(repr);
2138 while (size-- > 0) {
2139 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002140#ifdef Py_UNICODE_WIDE
2141 /* Map 32-bit characters to '\Uxxxxxxxx' */
2142 if (ch >= 0x10000) {
2143 *p++ = '\\';
2144 *p++ = 'U';
2145 *p++ = hexdigit[(ch >> 28) & 0xf];
2146 *p++ = hexdigit[(ch >> 24) & 0xf];
2147 *p++ = hexdigit[(ch >> 20) & 0xf];
2148 *p++ = hexdigit[(ch >> 16) & 0xf];
2149 *p++ = hexdigit[(ch >> 12) & 0xf];
2150 *p++ = hexdigit[(ch >> 8) & 0xf];
2151 *p++ = hexdigit[(ch >> 4) & 0xf];
2152 *p++ = hexdigit[ch & 15];
2153 }
2154 else
2155#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 /* Map 16-bit characters to '\uxxxx' */
2157 if (ch >= 256) {
2158 *p++ = '\\';
2159 *p++ = 'u';
2160 *p++ = hexdigit[(ch >> 12) & 0xf];
2161 *p++ = hexdigit[(ch >> 8) & 0xf];
2162 *p++ = hexdigit[(ch >> 4) & 0xf];
2163 *p++ = hexdigit[ch & 15];
2164 }
2165 /* Copy everything else as-is */
2166 else
2167 *p++ = (char) ch;
2168 }
2169 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002170 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171 return repr;
2172}
2173
2174PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2175{
2176 if (!PyUnicode_Check(unicode)) {
2177 PyErr_BadArgument();
2178 return NULL;
2179 }
2180 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2181 PyUnicode_GET_SIZE(unicode));
2182}
2183
2184/* --- Latin-1 Codec ------------------------------------------------------ */
2185
2186PyObject *PyUnicode_DecodeLatin1(const char *s,
2187 int size,
2188 const char *errors)
2189{
2190 PyUnicodeObject *v;
2191 Py_UNICODE *p;
2192
2193 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002194 if (size == 1 && *(unsigned char*)s < 256) {
2195 Py_UNICODE r = *(unsigned char*)s;
2196 return PyUnicode_FromUnicode(&r, 1);
2197 }
2198
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 v = _PyUnicode_New(size);
2200 if (v == NULL)
2201 goto onError;
2202 if (size == 0)
2203 return (PyObject *)v;
2204 p = PyUnicode_AS_UNICODE(v);
2205 while (size-- > 0)
2206 *p++ = (unsigned char)*s++;
2207 return (PyObject *)v;
2208
2209 onError:
2210 Py_XDECREF(v);
2211 return NULL;
2212}
2213
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002214/* create or adjust a UnicodeEncodeError */
2215static void make_encode_exception(PyObject **exceptionObject,
2216 const char *encoding,
2217 const Py_UNICODE *unicode, int size,
2218 int startpos, int endpos,
2219 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002221 if (*exceptionObject == NULL) {
2222 *exceptionObject = PyUnicodeEncodeError_Create(
2223 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 }
2225 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002226 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2227 goto onError;
2228 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2229 goto onError;
2230 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2231 goto onError;
2232 return;
2233 onError:
2234 Py_DECREF(*exceptionObject);
2235 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 }
2237}
2238
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002239/* raises a UnicodeEncodeError */
2240static void raise_encode_exception(PyObject **exceptionObject,
2241 const char *encoding,
2242 const Py_UNICODE *unicode, int size,
2243 int startpos, int endpos,
2244 const char *reason)
2245{
2246 make_encode_exception(exceptionObject,
2247 encoding, unicode, size, startpos, endpos, reason);
2248 if (*exceptionObject != NULL)
2249 PyCodec_StrictErrors(*exceptionObject);
2250}
2251
2252/* error handling callback helper:
2253 build arguments, call the callback and check the arguments,
2254 put the result into newpos and return the replacement string, which
2255 has to be freed by the caller */
2256static PyObject *unicode_encode_call_errorhandler(const char *errors,
2257 PyObject **errorHandler,
2258 const char *encoding, const char *reason,
2259 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2260 int startpos, int endpos,
2261 int *newpos)
2262{
2263 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2264
2265 PyObject *restuple;
2266 PyObject *resunicode;
2267
2268 if (*errorHandler == NULL) {
2269 *errorHandler = PyCodec_LookupError(errors);
2270 if (*errorHandler == NULL)
2271 return NULL;
2272 }
2273
2274 make_encode_exception(exceptionObject,
2275 encoding, unicode, size, startpos, endpos, reason);
2276 if (*exceptionObject == NULL)
2277 return NULL;
2278
2279 restuple = PyObject_CallFunctionObjArgs(
2280 *errorHandler, *exceptionObject, NULL);
2281 if (restuple == NULL)
2282 return NULL;
2283 if (!PyTuple_Check(restuple)) {
2284 PyErr_Format(PyExc_TypeError, &argparse[4]);
2285 Py_DECREF(restuple);
2286 return NULL;
2287 }
2288 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2289 &resunicode, newpos)) {
2290 Py_DECREF(restuple);
2291 return NULL;
2292 }
2293 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002294 *newpos = size+*newpos;
2295 if (*newpos<0 || *newpos>size) {
2296 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2297 Py_DECREF(restuple);
2298 return NULL;
2299 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002300 Py_INCREF(resunicode);
2301 Py_DECREF(restuple);
2302 return resunicode;
2303}
2304
2305static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2306 int size,
2307 const char *errors,
2308 int limit)
2309{
2310 /* output object */
2311 PyObject *res;
2312 /* pointers to the beginning and end+1 of input */
2313 const Py_UNICODE *startp = p;
2314 const Py_UNICODE *endp = p + size;
2315 /* pointer to the beginning of the unencodable characters */
2316 /* const Py_UNICODE *badp = NULL; */
2317 /* pointer into the output */
2318 char *str;
2319 /* current output position */
2320 int respos = 0;
2321 int ressize;
2322 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2323 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2324 PyObject *errorHandler = NULL;
2325 PyObject *exc = NULL;
2326 /* the following variable is used for caching string comparisons
2327 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2328 int known_errorHandler = -1;
2329
2330 /* allocate enough for a simple encoding without
2331 replacements, if we need more, we'll resize */
2332 res = PyString_FromStringAndSize(NULL, size);
2333 if (res == NULL)
2334 goto onError;
2335 if (size == 0)
2336 return res;
2337 str = PyString_AS_STRING(res);
2338 ressize = size;
2339
2340 while (p<endp) {
2341 Py_UNICODE c = *p;
2342
2343 /* can we encode this? */
2344 if (c<limit) {
2345 /* no overflow check, because we know that the space is enough */
2346 *str++ = (char)c;
2347 ++p;
2348 }
2349 else {
2350 int unicodepos = p-startp;
2351 int requiredsize;
2352 PyObject *repunicode;
2353 int repsize;
2354 int newpos;
2355 int respos;
2356 Py_UNICODE *uni2;
2357 /* startpos for collecting unencodable chars */
2358 const Py_UNICODE *collstart = p;
2359 const Py_UNICODE *collend = p;
2360 /* find all unecodable characters */
2361 while ((collend < endp) && ((*collend)>=limit))
2362 ++collend;
2363 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2364 if (known_errorHandler==-1) {
2365 if ((errors==NULL) || (!strcmp(errors, "strict")))
2366 known_errorHandler = 1;
2367 else if (!strcmp(errors, "replace"))
2368 known_errorHandler = 2;
2369 else if (!strcmp(errors, "ignore"))
2370 known_errorHandler = 3;
2371 else if (!strcmp(errors, "xmlcharrefreplace"))
2372 known_errorHandler = 4;
2373 else
2374 known_errorHandler = 0;
2375 }
2376 switch (known_errorHandler) {
2377 case 1: /* strict */
2378 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2379 goto onError;
2380 case 2: /* replace */
2381 while (collstart++<collend)
2382 *str++ = '?'; /* fall through */
2383 case 3: /* ignore */
2384 p = collend;
2385 break;
2386 case 4: /* xmlcharrefreplace */
2387 respos = str-PyString_AS_STRING(res);
2388 /* determine replacement size (temporarily (mis)uses p) */
2389 for (p = collstart, repsize = 0; p < collend; ++p) {
2390 if (*p<10)
2391 repsize += 2+1+1;
2392 else if (*p<100)
2393 repsize += 2+2+1;
2394 else if (*p<1000)
2395 repsize += 2+3+1;
2396 else if (*p<10000)
2397 repsize += 2+4+1;
2398 else if (*p<100000)
2399 repsize += 2+5+1;
2400 else if (*p<1000000)
2401 repsize += 2+6+1;
2402 else
2403 repsize += 2+7+1;
2404 }
2405 requiredsize = respos+repsize+(endp-collend);
2406 if (requiredsize > ressize) {
2407 if (requiredsize<2*ressize)
2408 requiredsize = 2*ressize;
2409 if (_PyString_Resize(&res, requiredsize))
2410 goto onError;
2411 str = PyString_AS_STRING(res) + respos;
2412 ressize = requiredsize;
2413 }
2414 /* generate replacement (temporarily (mis)uses p) */
2415 for (p = collstart; p < collend; ++p) {
2416 str += sprintf(str, "&#%d;", (int)*p);
2417 }
2418 p = collend;
2419 break;
2420 default:
2421 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2422 encoding, reason, startp, size, &exc,
2423 collstart-startp, collend-startp, &newpos);
2424 if (repunicode == NULL)
2425 goto onError;
2426 /* need more space? (at least enough for what we
2427 have+the replacement+the rest of the string, so
2428 we won't have to check space for encodable characters) */
2429 respos = str-PyString_AS_STRING(res);
2430 repsize = PyUnicode_GET_SIZE(repunicode);
2431 requiredsize = respos+repsize+(endp-collend);
2432 if (requiredsize > ressize) {
2433 if (requiredsize<2*ressize)
2434 requiredsize = 2*ressize;
2435 if (_PyString_Resize(&res, requiredsize)) {
2436 Py_DECREF(repunicode);
2437 goto onError;
2438 }
2439 str = PyString_AS_STRING(res) + respos;
2440 ressize = requiredsize;
2441 }
2442 /* check if there is anything unencodable in the replacement
2443 and copy it to the output */
2444 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2445 c = *uni2;
2446 if (c >= limit) {
2447 raise_encode_exception(&exc, encoding, startp, size,
2448 unicodepos, unicodepos+1, reason);
2449 Py_DECREF(repunicode);
2450 goto onError;
2451 }
2452 *str = (char)c;
2453 }
2454 p = startp + newpos;
2455 Py_DECREF(repunicode);
2456 }
2457 }
2458 }
2459 /* Resize if we allocated to much */
2460 respos = str-PyString_AS_STRING(res);
2461 if (respos<ressize)
2462 /* If this falls res will be NULL */
2463 _PyString_Resize(&res, respos);
2464 Py_XDECREF(errorHandler);
2465 Py_XDECREF(exc);
2466 return res;
2467
2468 onError:
2469 Py_XDECREF(res);
2470 Py_XDECREF(errorHandler);
2471 Py_XDECREF(exc);
2472 return NULL;
2473}
2474
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2476 int size,
2477 const char *errors)
2478{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480}
2481
2482PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2483{
2484 if (!PyUnicode_Check(unicode)) {
2485 PyErr_BadArgument();
2486 return NULL;
2487 }
2488 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2489 PyUnicode_GET_SIZE(unicode),
2490 NULL);
2491}
2492
2493/* --- 7-bit ASCII Codec -------------------------------------------------- */
2494
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495PyObject *PyUnicode_DecodeASCII(const char *s,
2496 int size,
2497 const char *errors)
2498{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002499 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 PyUnicodeObject *v;
2501 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002502 int startinpos;
2503 int endinpos;
2504 int outpos;
2505 const char *e;
2506 PyObject *errorHandler = NULL;
2507 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508
2509 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002510 if (size == 1 && *(unsigned char*)s < 128) {
2511 Py_UNICODE r = *(unsigned char*)s;
2512 return PyUnicode_FromUnicode(&r, 1);
2513 }
2514
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 v = _PyUnicode_New(size);
2516 if (v == NULL)
2517 goto onError;
2518 if (size == 0)
2519 return (PyObject *)v;
2520 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002521 e = s + size;
2522 while (s < e) {
2523 register unsigned char c = (unsigned char)*s;
2524 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002526 ++s;
2527 }
2528 else {
2529 startinpos = s-starts;
2530 endinpos = startinpos + 1;
2531 outpos = p-PyUnicode_AS_UNICODE(v);
2532 if (unicode_decode_call_errorhandler(
2533 errors, &errorHandler,
2534 "ascii", "ordinal not in range(128)",
2535 starts, size, &startinpos, &endinpos, &exc, &s,
2536 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002540 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002541 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002542 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002543 Py_XDECREF(errorHandler);
2544 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 return (PyObject *)v;
2546
2547 onError:
2548 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 Py_XDECREF(errorHandler);
2550 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 return NULL;
2552}
2553
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2555 int size,
2556 const char *errors)
2557{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002558 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559}
2560
2561PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2562{
2563 if (!PyUnicode_Check(unicode)) {
2564 PyErr_BadArgument();
2565 return NULL;
2566 }
2567 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2568 PyUnicode_GET_SIZE(unicode),
2569 NULL);
2570}
2571
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002572#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002573
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002574/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002575
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002576PyObject *PyUnicode_DecodeMBCS(const char *s,
2577 int size,
2578 const char *errors)
2579{
2580 PyUnicodeObject *v;
2581 Py_UNICODE *p;
2582
2583 /* First get the size of the result */
2584 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002585 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002586 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2587
2588 v = _PyUnicode_New(usize);
2589 if (v == NULL)
2590 return NULL;
2591 if (usize == 0)
2592 return (PyObject *)v;
2593 p = PyUnicode_AS_UNICODE(v);
2594 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2595 Py_DECREF(v);
2596 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2597 }
2598
2599 return (PyObject *)v;
2600}
2601
2602PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2603 int size,
2604 const char *errors)
2605{
2606 PyObject *repr;
2607 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002608 DWORD mbcssize;
2609
2610 /* If there are no characters, bail now! */
2611 if (size==0)
2612 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002613
2614 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002615 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002616 if (mbcssize==0)
2617 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2618
2619 repr = PyString_FromStringAndSize(NULL, mbcssize);
2620 if (repr == NULL)
2621 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002622 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002623 return repr;
2624
2625 /* Do the conversion */
2626 s = PyString_AS_STRING(repr);
2627 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2628 Py_DECREF(repr);
2629 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2630 }
2631 return repr;
2632}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002633
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002634PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2635{
2636 if (!PyUnicode_Check(unicode)) {
2637 PyErr_BadArgument();
2638 return NULL;
2639 }
2640 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2641 PyUnicode_GET_SIZE(unicode),
2642 NULL);
2643}
2644
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002645#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002646
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647/* --- Character Mapping Codec -------------------------------------------- */
2648
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649PyObject *PyUnicode_DecodeCharmap(const char *s,
2650 int size,
2651 PyObject *mapping,
2652 const char *errors)
2653{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002654 const char *starts = s;
2655 int startinpos;
2656 int endinpos;
2657 int outpos;
2658 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 PyUnicodeObject *v;
2660 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002661 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002662 PyObject *errorHandler = NULL;
2663 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664
2665 /* Default to Latin-1 */
2666 if (mapping == NULL)
2667 return PyUnicode_DecodeLatin1(s, size, errors);
2668
2669 v = _PyUnicode_New(size);
2670 if (v == NULL)
2671 goto onError;
2672 if (size == 0)
2673 return (PyObject *)v;
2674 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002675 e = s + size;
2676 while (s < e) {
2677 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 PyObject *w, *x;
2679
2680 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2681 w = PyInt_FromLong((long)ch);
2682 if (w == NULL)
2683 goto onError;
2684 x = PyObject_GetItem(mapping, w);
2685 Py_DECREF(w);
2686 if (x == NULL) {
2687 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002688 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002690 x = Py_None;
2691 Py_INCREF(x);
2692 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002693 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 }
2695
2696 /* Apply mapping */
2697 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002698 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 if (value < 0 || value > 65535) {
2700 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002701 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 Py_DECREF(x);
2703 goto onError;
2704 }
2705 *p++ = (Py_UNICODE)value;
2706 }
2707 else if (x == Py_None) {
2708 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002709 outpos = p-PyUnicode_AS_UNICODE(v);
2710 startinpos = s-starts;
2711 endinpos = startinpos+1;
2712 if (unicode_decode_call_errorhandler(
2713 errors, &errorHandler,
2714 "charmap", "character maps to <undefined>",
2715 starts, size, &startinpos, &endinpos, &exc, &s,
2716 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717 Py_DECREF(x);
2718 goto onError;
2719 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002720 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 }
2722 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002723 int targetsize = PyUnicode_GET_SIZE(x);
2724
2725 if (targetsize == 1)
2726 /* 1-1 mapping */
2727 *p++ = *PyUnicode_AS_UNICODE(x);
2728
2729 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002731 if (targetsize > extrachars) {
2732 /* resize first */
2733 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2734 int needed = (targetsize - extrachars) + \
2735 (targetsize << 2);
2736 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002737 if (_PyUnicode_Resize(&v,
2738 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002739 Py_DECREF(x);
2740 goto onError;
2741 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002742 p = PyUnicode_AS_UNICODE(v) + oldpos;
2743 }
2744 Py_UNICODE_COPY(p,
2745 PyUnicode_AS_UNICODE(x),
2746 targetsize);
2747 p += targetsize;
2748 extrachars -= targetsize;
2749 }
2750 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 }
2752 else {
2753 /* wrong return value */
2754 PyErr_SetString(PyExc_TypeError,
2755 "character mapping must return integer, None or unicode");
2756 Py_DECREF(x);
2757 goto onError;
2758 }
2759 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 }
2762 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002763 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002765 Py_XDECREF(errorHandler);
2766 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 return (PyObject *)v;
2768
2769 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 Py_XDECREF(errorHandler);
2771 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 Py_XDECREF(v);
2773 return NULL;
2774}
2775
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002776/* Lookup the character ch in the mapping. If the character
2777 can't be found, Py_None is returned (or NULL, if another
2778 error occured). */
2779static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002781 PyObject *w = PyInt_FromLong((long)c);
2782 PyObject *x;
2783
2784 if (w == NULL)
2785 return NULL;
2786 x = PyObject_GetItem(mapping, w);
2787 Py_DECREF(w);
2788 if (x == NULL) {
2789 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2790 /* No mapping found means: mapping is undefined. */
2791 PyErr_Clear();
2792 x = Py_None;
2793 Py_INCREF(x);
2794 return x;
2795 } else
2796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002798 else if (x == Py_None)
2799 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002800 else if (PyInt_Check(x)) {
2801 long value = PyInt_AS_LONG(x);
2802 if (value < 0 || value > 255) {
2803 PyErr_SetString(PyExc_TypeError,
2804 "character mapping must be in range(256)");
2805 Py_DECREF(x);
2806 return NULL;
2807 }
2808 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810 else if (PyString_Check(x))
2811 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002813 /* wrong return value */
2814 PyErr_SetString(PyExc_TypeError,
2815 "character mapping must return integer, None or str");
2816 Py_DECREF(x);
2817 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 }
2819}
2820
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821/* lookup the character, put the result in the output string and adjust
2822 various state variables. Reallocate the output string if not enough
2823 space is available. Return a new reference to the object that
2824 was put in the output buffer, or Py_None, if the mapping was undefined
2825 (in which case no character was written) or NULL, if a
2826 reallocation error ocurred. The called must decref the result */
2827static
2828PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2829 PyObject **outobj, int *outpos)
2830{
2831 PyObject *rep = charmapencode_lookup(c, mapping);
2832
2833 if (rep==NULL)
2834 return NULL;
2835 else if (rep==Py_None)
2836 return rep;
2837 else {
2838 char *outstart = PyString_AS_STRING(*outobj);
2839 int outsize = PyString_GET_SIZE(*outobj);
2840 if (PyInt_Check(rep)) {
2841 int requiredsize = *outpos+1;
2842 if (outsize<requiredsize) {
2843 /* exponentially overallocate to minimize reallocations */
2844 if (requiredsize < 2*outsize)
2845 requiredsize = 2*outsize;
2846 if (_PyString_Resize(outobj, requiredsize)) {
2847 Py_DECREF(rep);
2848 return NULL;
2849 }
2850 outstart = PyString_AS_STRING(*outobj);
2851 }
2852 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2853 }
2854 else {
2855 const char *repchars = PyString_AS_STRING(rep);
2856 int repsize = PyString_GET_SIZE(rep);
2857 int requiredsize = *outpos+repsize;
2858 if (outsize<requiredsize) {
2859 /* exponentially overallocate to minimize reallocations */
2860 if (requiredsize < 2*outsize)
2861 requiredsize = 2*outsize;
2862 if (_PyString_Resize(outobj, requiredsize)) {
2863 Py_DECREF(rep);
2864 return NULL;
2865 }
2866 outstart = PyString_AS_STRING(*outobj);
2867 }
2868 memcpy(outstart + *outpos, repchars, repsize);
2869 *outpos += repsize;
2870 }
2871 }
2872 return rep;
2873}
2874
2875/* handle an error in PyUnicode_EncodeCharmap
2876 Return 0 on success, -1 on error */
2877static
2878int charmap_encoding_error(
2879 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2880 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002881 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002882 PyObject **res, int *respos)
2883{
2884 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2885 int repsize;
2886 int newpos;
2887 Py_UNICODE *uni2;
2888 /* startpos for collecting unencodable chars */
2889 int collstartpos = *inpos;
2890 int collendpos = *inpos+1;
2891 int collpos;
2892 char *encoding = "charmap";
2893 char *reason = "character maps to <undefined>";
2894
2895 PyObject *x;
2896 /* find all unencodable characters */
2897 while (collendpos < size) {
2898 x = charmapencode_lookup(p[collendpos], mapping);
2899 if (x==NULL)
2900 return -1;
2901 else if (x!=Py_None) {
2902 Py_DECREF(x);
2903 break;
2904 }
2905 Py_DECREF(x);
2906 ++collendpos;
2907 }
2908 /* cache callback name lookup
2909 * (if not done yet, i.e. it's the first error) */
2910 if (*known_errorHandler==-1) {
2911 if ((errors==NULL) || (!strcmp(errors, "strict")))
2912 *known_errorHandler = 1;
2913 else if (!strcmp(errors, "replace"))
2914 *known_errorHandler = 2;
2915 else if (!strcmp(errors, "ignore"))
2916 *known_errorHandler = 3;
2917 else if (!strcmp(errors, "xmlcharrefreplace"))
2918 *known_errorHandler = 4;
2919 else
2920 *known_errorHandler = 0;
2921 }
2922 switch (*known_errorHandler) {
2923 case 1: /* strict */
2924 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2925 return -1;
2926 case 2: /* replace */
2927 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2928 x = charmapencode_output('?', mapping, res, respos);
2929 if (x==NULL) {
2930 return -1;
2931 }
2932 else if (x==Py_None) {
2933 Py_DECREF(x);
2934 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2935 return -1;
2936 }
2937 Py_DECREF(x);
2938 }
2939 /* fall through */
2940 case 3: /* ignore */
2941 *inpos = collendpos;
2942 break;
2943 case 4: /* xmlcharrefreplace */
2944 /* generate replacement (temporarily (mis)uses p) */
2945 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2946 char buffer[2+29+1+1];
2947 char *cp;
2948 sprintf(buffer, "&#%d;", (int)p[collpos]);
2949 for (cp = buffer; *cp; ++cp) {
2950 x = charmapencode_output(*cp, mapping, res, respos);
2951 if (x==NULL)
2952 return -1;
2953 else if (x==Py_None) {
2954 Py_DECREF(x);
2955 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2956 return -1;
2957 }
2958 Py_DECREF(x);
2959 }
2960 }
2961 *inpos = collendpos;
2962 break;
2963 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002964 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002965 encoding, reason, p, size, exceptionObject,
2966 collstartpos, collendpos, &newpos);
2967 if (repunicode == NULL)
2968 return -1;
2969 /* generate replacement */
2970 repsize = PyUnicode_GET_SIZE(repunicode);
2971 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2972 x = charmapencode_output(*uni2, mapping, res, respos);
2973 if (x==NULL) {
2974 Py_DECREF(repunicode);
2975 return -1;
2976 }
2977 else if (x==Py_None) {
2978 Py_DECREF(repunicode);
2979 Py_DECREF(x);
2980 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2981 return -1;
2982 }
2983 Py_DECREF(x);
2984 }
2985 *inpos = newpos;
2986 Py_DECREF(repunicode);
2987 }
2988 return 0;
2989}
2990
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2992 int size,
2993 PyObject *mapping,
2994 const char *errors)
2995{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996 /* output object */
2997 PyObject *res = NULL;
2998 /* current input position */
2999 int inpos = 0;
3000 /* current output position */
3001 int respos = 0;
3002 PyObject *errorHandler = NULL;
3003 PyObject *exc = NULL;
3004 /* the following variable is used for caching string comparisons
3005 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3006 * 3=ignore, 4=xmlcharrefreplace */
3007 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008
3009 /* Default to Latin-1 */
3010 if (mapping == NULL)
3011 return PyUnicode_EncodeLatin1(p, size, errors);
3012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003013 /* allocate enough for a simple encoding without
3014 replacements, if we need more, we'll resize */
3015 res = PyString_FromStringAndSize(NULL, size);
3016 if (res == NULL)
3017 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003018 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003019 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003021 while (inpos<size) {
3022 /* try to encode it */
3023 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3024 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003026 if (x==Py_None) { /* unencodable character */
3027 if (charmap_encoding_error(p, size, &inpos, mapping,
3028 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003029 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003030 &res, &respos)) {
3031 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003032 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003035 else
3036 /* done with this character => adjust input position */
3037 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 Py_DECREF(x);
3039 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041 /* Resize if we allocated to much */
3042 if (respos<PyString_GET_SIZE(res)) {
3043 if (_PyString_Resize(&res, respos))
3044 goto onError;
3045 }
3046 Py_XDECREF(exc);
3047 Py_XDECREF(errorHandler);
3048 return res;
3049
3050 onError:
3051 Py_XDECREF(res);
3052 Py_XDECREF(exc);
3053 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 return NULL;
3055}
3056
3057PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3058 PyObject *mapping)
3059{
3060 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3061 PyErr_BadArgument();
3062 return NULL;
3063 }
3064 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3065 PyUnicode_GET_SIZE(unicode),
3066 mapping,
3067 NULL);
3068}
3069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070/* create or adjust a UnicodeTranslateError */
3071static void make_translate_exception(PyObject **exceptionObject,
3072 const Py_UNICODE *unicode, int size,
3073 int startpos, int endpos,
3074 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 if (*exceptionObject == NULL) {
3077 *exceptionObject = PyUnicodeTranslateError_Create(
3078 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 }
3080 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003081 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3082 goto onError;
3083 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3084 goto onError;
3085 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3086 goto onError;
3087 return;
3088 onError:
3089 Py_DECREF(*exceptionObject);
3090 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
3092}
3093
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094/* raises a UnicodeTranslateError */
3095static void raise_translate_exception(PyObject **exceptionObject,
3096 const Py_UNICODE *unicode, int size,
3097 int startpos, int endpos,
3098 const char *reason)
3099{
3100 make_translate_exception(exceptionObject,
3101 unicode, size, startpos, endpos, reason);
3102 if (*exceptionObject != NULL)
3103 PyCodec_StrictErrors(*exceptionObject);
3104}
3105
3106/* error handling callback helper:
3107 build arguments, call the callback and check the arguments,
3108 put the result into newpos and return the replacement string, which
3109 has to be freed by the caller */
3110static PyObject *unicode_translate_call_errorhandler(const char *errors,
3111 PyObject **errorHandler,
3112 const char *reason,
3113 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3114 int startpos, int endpos,
3115 int *newpos)
3116{
3117 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3118
3119 PyObject *restuple;
3120 PyObject *resunicode;
3121
3122 if (*errorHandler == NULL) {
3123 *errorHandler = PyCodec_LookupError(errors);
3124 if (*errorHandler == NULL)
3125 return NULL;
3126 }
3127
3128 make_translate_exception(exceptionObject,
3129 unicode, size, startpos, endpos, reason);
3130 if (*exceptionObject == NULL)
3131 return NULL;
3132
3133 restuple = PyObject_CallFunctionObjArgs(
3134 *errorHandler, *exceptionObject, NULL);
3135 if (restuple == NULL)
3136 return NULL;
3137 if (!PyTuple_Check(restuple)) {
3138 PyErr_Format(PyExc_TypeError, &argparse[4]);
3139 Py_DECREF(restuple);
3140 return NULL;
3141 }
3142 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3143 &resunicode, newpos)) {
3144 Py_DECREF(restuple);
3145 return NULL;
3146 }
3147 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003148 *newpos = size+*newpos;
3149 if (*newpos<0 || *newpos>size) {
3150 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3151 Py_DECREF(restuple);
3152 return NULL;
3153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003154 Py_INCREF(resunicode);
3155 Py_DECREF(restuple);
3156 return resunicode;
3157}
3158
3159/* Lookup the character ch in the mapping and put the result in result,
3160 which must be decrefed by the caller.
3161 Return 0 on success, -1 on error */
3162static
3163int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3164{
3165 PyObject *w = PyInt_FromLong((long)c);
3166 PyObject *x;
3167
3168 if (w == NULL)
3169 return -1;
3170 x = PyObject_GetItem(mapping, w);
3171 Py_DECREF(w);
3172 if (x == NULL) {
3173 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3174 /* No mapping found means: use 1:1 mapping. */
3175 PyErr_Clear();
3176 *result = NULL;
3177 return 0;
3178 } else
3179 return -1;
3180 }
3181 else if (x == Py_None) {
3182 *result = x;
3183 return 0;
3184 }
3185 else if (PyInt_Check(x)) {
3186 long value = PyInt_AS_LONG(x);
3187 long max = PyUnicode_GetMax();
3188 if (value < 0 || value > max) {
3189 PyErr_Format(PyExc_TypeError,
3190 "character mapping must be in range(0x%lx)", max+1);
3191 Py_DECREF(x);
3192 return -1;
3193 }
3194 *result = x;
3195 return 0;
3196 }
3197 else if (PyUnicode_Check(x)) {
3198 *result = x;
3199 return 0;
3200 }
3201 else {
3202 /* wrong return value */
3203 PyErr_SetString(PyExc_TypeError,
3204 "character mapping must return integer, None or unicode");
3205 return -1;
3206 }
3207}
3208/* ensure that *outobj is at least requiredsize characters long,
3209if not reallocate and adjust various state variables.
3210Return 0 on success, -1 on error */
3211static
3212int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3213 int requiredsize)
3214{
3215 if (requiredsize > *outsize) {
3216 /* remember old output position */
3217 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3218 /* exponentially overallocate to minimize reallocations */
3219 if (requiredsize < 2 * *outsize)
3220 requiredsize = 2 * *outsize;
3221 if (_PyUnicode_Resize(outobj, requiredsize))
3222 return -1;
3223 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3224 *outsize = requiredsize;
3225 }
3226 return 0;
3227}
3228/* lookup the character, put the result in the output string and adjust
3229 various state variables. Return a new reference to the object that
3230 was put in the output buffer in *result, or Py_None, if the mapping was
3231 undefined (in which case no character was written).
3232 The called must decref result.
3233 Return 0 on success, -1 on error. */
3234static
3235int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3236 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3237{
3238 if (charmaptranslate_lookup(c, mapping, res))
3239 return -1;
3240 if (*res==NULL) {
3241 /* not found => default to 1:1 mapping */
3242 *(*outp)++ = (Py_UNICODE)c;
3243 }
3244 else if (*res==Py_None)
3245 ;
3246 else if (PyInt_Check(*res)) {
3247 /* no overflow check, because we know that the space is enough */
3248 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3249 }
3250 else if (PyUnicode_Check(*res)) {
3251 int repsize = PyUnicode_GET_SIZE(*res);
3252 if (repsize==1) {
3253 /* no overflow check, because we know that the space is enough */
3254 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3255 }
3256 else if (repsize!=0) {
3257 /* more than one character */
3258 int requiredsize = *outsize + repsize - 1;
3259 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3260 return -1;
3261 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3262 *outp += repsize;
3263 }
3264 }
3265 else
3266 return -1;
3267 return 0;
3268}
3269
3270PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 int size,
3272 PyObject *mapping,
3273 const char *errors)
3274{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275 /* output object */
3276 PyObject *res = NULL;
3277 /* pointers to the beginning and end+1 of input */
3278 const Py_UNICODE *startp = p;
3279 const Py_UNICODE *endp = p + size;
3280 /* pointer into the output */
3281 Py_UNICODE *str;
3282 /* current output position */
3283 int respos = 0;
3284 int ressize;
3285 char *reason = "character maps to <undefined>";
3286 PyObject *errorHandler = NULL;
3287 PyObject *exc = NULL;
3288 /* the following variable is used for caching string comparisons
3289 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3290 * 3=ignore, 4=xmlcharrefreplace */
3291 int known_errorHandler = -1;
3292
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 if (mapping == NULL) {
3294 PyErr_BadArgument();
3295 return NULL;
3296 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297
3298 /* allocate enough for a simple 1:1 translation without
3299 replacements, if we need more, we'll resize */
3300 res = PyUnicode_FromUnicode(NULL, size);
3301 if (res == NULL)
3302 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 return res;
3305 str = PyUnicode_AS_UNICODE(res);
3306 ressize = size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308 while (p<endp) {
3309 /* try to encode it */
3310 PyObject *x = NULL;
3311 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3312 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 goto onError;
3314 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003315 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003316 if (x!=Py_None) /* it worked => adjust input pointer */
3317 ++p;
3318 else { /* untranslatable character */
3319 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3320 int repsize;
3321 int newpos;
3322 Py_UNICODE *uni2;
3323 /* startpos for collecting untranslatable chars */
3324 const Py_UNICODE *collstart = p;
3325 const Py_UNICODE *collend = p+1;
3326 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003328 /* find all untranslatable characters */
3329 while (collend < endp) {
3330 if (charmaptranslate_lookup(*collend, mapping, &x))
3331 goto onError;
3332 Py_XDECREF(x);
3333 if (x!=Py_None)
3334 break;
3335 ++collend;
3336 }
3337 /* cache callback name lookup
3338 * (if not done yet, i.e. it's the first error) */
3339 if (known_errorHandler==-1) {
3340 if ((errors==NULL) || (!strcmp(errors, "strict")))
3341 known_errorHandler = 1;
3342 else if (!strcmp(errors, "replace"))
3343 known_errorHandler = 2;
3344 else if (!strcmp(errors, "ignore"))
3345 known_errorHandler = 3;
3346 else if (!strcmp(errors, "xmlcharrefreplace"))
3347 known_errorHandler = 4;
3348 else
3349 known_errorHandler = 0;
3350 }
3351 switch (known_errorHandler) {
3352 case 1: /* strict */
3353 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3354 goto onError;
3355 case 2: /* replace */
3356 /* No need to check for space, this is a 1:1 replacement */
3357 for (coll = collstart; coll<collend; ++coll)
3358 *str++ = '?';
3359 /* fall through */
3360 case 3: /* ignore */
3361 p = collend;
3362 break;
3363 case 4: /* xmlcharrefreplace */
3364 /* generate replacement (temporarily (mis)uses p) */
3365 for (p = collstart; p < collend; ++p) {
3366 char buffer[2+29+1+1];
3367 char *cp;
3368 sprintf(buffer, "&#%d;", (int)*p);
3369 if (charmaptranslate_makespace(&res, &str, &ressize,
3370 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3371 goto onError;
3372 for (cp = buffer; *cp; ++cp)
3373 *str++ = *cp;
3374 }
3375 p = collend;
3376 break;
3377 default:
3378 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3379 reason, startp, size, &exc,
3380 collstart-startp, collend-startp, &newpos);
3381 if (repunicode == NULL)
3382 goto onError;
3383 /* generate replacement */
3384 repsize = PyUnicode_GET_SIZE(repunicode);
3385 if (charmaptranslate_makespace(&res, &str, &ressize,
3386 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3387 Py_DECREF(repunicode);
3388 goto onError;
3389 }
3390 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3391 *str++ = *uni2;
3392 p = startp + newpos;
3393 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003394 }
3395 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003396 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003397 /* Resize if we allocated to much */
3398 respos = str-PyUnicode_AS_UNICODE(res);
3399 if (respos<ressize) {
3400 if (_PyUnicode_Resize(&res, respos))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003401 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003402 }
3403 Py_XDECREF(exc);
3404 Py_XDECREF(errorHandler);
3405 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407 onError:
3408 Py_XDECREF(res);
3409 Py_XDECREF(exc);
3410 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411 return NULL;
3412}
3413
3414PyObject *PyUnicode_Translate(PyObject *str,
3415 PyObject *mapping,
3416 const char *errors)
3417{
3418 PyObject *result;
3419
3420 str = PyUnicode_FromObject(str);
3421 if (str == NULL)
3422 goto onError;
3423 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3424 PyUnicode_GET_SIZE(str),
3425 mapping,
3426 errors);
3427 Py_DECREF(str);
3428 return result;
3429
3430 onError:
3431 Py_XDECREF(str);
3432 return NULL;
3433}
3434
Guido van Rossum9e896b32000-04-05 20:11:21 +00003435/* --- Decimal Encoder ---------------------------------------------------- */
3436
3437int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3438 int length,
3439 char *output,
3440 const char *errors)
3441{
3442 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003443 PyObject *errorHandler = NULL;
3444 PyObject *exc = NULL;
3445 const char *encoding = "decimal";
3446 const char *reason = "invalid decimal Unicode string";
3447 /* the following variable is used for caching string comparisons
3448 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3449 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003450
3451 if (output == NULL) {
3452 PyErr_BadArgument();
3453 return -1;
3454 }
3455
3456 p = s;
3457 end = s + length;
3458 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003459 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003460 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461 PyObject *repunicode;
3462 int repsize;
3463 int newpos;
3464 Py_UNICODE *uni2;
3465 Py_UNICODE *collstart;
3466 Py_UNICODE *collend;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003467
3468 if (Py_UNICODE_ISSPACE(ch)) {
3469 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003471 continue;
3472 }
3473 decimal = Py_UNICODE_TODECIMAL(ch);
3474 if (decimal >= 0) {
3475 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003477 continue;
3478 }
Guido van Rossumba477042000-04-06 18:18:10 +00003479 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003480 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003482 continue;
3483 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484 /* All other characters are considered unencodable */
3485 collstart = p;
3486 collend = p+1;
3487 while (collend < end) {
3488 if ((0 < *collend && *collend < 256) ||
3489 !Py_UNICODE_ISSPACE(*collend) ||
3490 Py_UNICODE_TODECIMAL(*collend))
3491 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003492 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493 /* cache callback name lookup
3494 * (if not done yet, i.e. it's the first error) */
3495 if (known_errorHandler==-1) {
3496 if ((errors==NULL) || (!strcmp(errors, "strict")))
3497 known_errorHandler = 1;
3498 else if (!strcmp(errors, "replace"))
3499 known_errorHandler = 2;
3500 else if (!strcmp(errors, "ignore"))
3501 known_errorHandler = 3;
3502 else if (!strcmp(errors, "xmlcharrefreplace"))
3503 known_errorHandler = 4;
3504 else
3505 known_errorHandler = 0;
3506 }
3507 switch (known_errorHandler) {
3508 case 1: /* strict */
3509 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3510 goto onError;
3511 case 2: /* replace */
3512 for (p = collstart; p < collend; ++p)
3513 *output++ = '?';
3514 /* fall through */
3515 case 3: /* ignore */
3516 p = collend;
3517 break;
3518 case 4: /* xmlcharrefreplace */
3519 /* generate replacement (temporarily (mis)uses p) */
3520 for (p = collstart; p < collend; ++p)
3521 output += sprintf(output, "&#%d;", (int)*p);
3522 p = collend;
3523 break;
3524 default:
3525 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3526 encoding, reason, s, length, &exc,
3527 collstart-s, collend-s, &newpos);
3528 if (repunicode == NULL)
3529 goto onError;
3530 /* generate replacement */
3531 repsize = PyUnicode_GET_SIZE(repunicode);
3532 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3533 Py_UNICODE ch = *uni2;
3534 if (Py_UNICODE_ISSPACE(ch))
3535 *output++ = ' ';
3536 else {
3537 decimal = Py_UNICODE_TODECIMAL(ch);
3538 if (decimal >= 0)
3539 *output++ = '0' + decimal;
3540 else if (0 < ch && ch < 256)
3541 *output++ = (char)ch;
3542 else {
3543 Py_DECREF(repunicode);
3544 raise_encode_exception(&exc, encoding,
3545 s, length, collstart-s, collend-s, reason);
3546 goto onError;
3547 }
3548 }
3549 }
3550 p = s + newpos;
3551 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003552 }
3553 }
3554 /* 0-terminate the output string */
3555 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 Py_XDECREF(exc);
3557 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003558 return 0;
3559
3560 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 Py_XDECREF(exc);
3562 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003563 return -1;
3564}
3565
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566/* --- Helpers ------------------------------------------------------------ */
3567
3568static
3569int count(PyUnicodeObject *self,
3570 int start,
3571 int end,
3572 PyUnicodeObject *substring)
3573{
3574 int count = 0;
3575
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003576 if (start < 0)
3577 start += self->length;
3578 if (start < 0)
3579 start = 0;
3580 if (end > self->length)
3581 end = self->length;
3582 if (end < 0)
3583 end += self->length;
3584 if (end < 0)
3585 end = 0;
3586
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003587 if (substring->length == 0)
3588 return (end - start + 1);
3589
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 end -= substring->length;
3591
3592 while (start <= end)
3593 if (Py_UNICODE_MATCH(self, start, substring)) {
3594 count++;
3595 start += substring->length;
3596 } else
3597 start++;
3598
3599 return count;
3600}
3601
3602int PyUnicode_Count(PyObject *str,
3603 PyObject *substr,
3604 int start,
3605 int end)
3606{
3607 int result;
3608
3609 str = PyUnicode_FromObject(str);
3610 if (str == NULL)
3611 return -1;
3612 substr = PyUnicode_FromObject(substr);
3613 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003614 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615 return -1;
3616 }
3617
3618 result = count((PyUnicodeObject *)str,
3619 start, end,
3620 (PyUnicodeObject *)substr);
3621
3622 Py_DECREF(str);
3623 Py_DECREF(substr);
3624 return result;
3625}
3626
3627static
3628int findstring(PyUnicodeObject *self,
3629 PyUnicodeObject *substring,
3630 int start,
3631 int end,
3632 int direction)
3633{
3634 if (start < 0)
3635 start += self->length;
3636 if (start < 0)
3637 start = 0;
3638
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 if (end > self->length)
3640 end = self->length;
3641 if (end < 0)
3642 end += self->length;
3643 if (end < 0)
3644 end = 0;
3645
Guido van Rossum76afbd92002-08-20 17:29:29 +00003646 if (substring->length == 0)
3647 return (direction > 0) ? start : end;
3648
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 end -= substring->length;
3650
3651 if (direction < 0) {
3652 for (; end >= start; end--)
3653 if (Py_UNICODE_MATCH(self, end, substring))
3654 return end;
3655 } else {
3656 for (; start <= end; start++)
3657 if (Py_UNICODE_MATCH(self, start, substring))
3658 return start;
3659 }
3660
3661 return -1;
3662}
3663
3664int PyUnicode_Find(PyObject *str,
3665 PyObject *substr,
3666 int start,
3667 int end,
3668 int direction)
3669{
3670 int result;
3671
3672 str = PyUnicode_FromObject(str);
3673 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003674 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003675 substr = PyUnicode_FromObject(substr);
3676 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003677 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003678 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 }
3680
3681 result = findstring((PyUnicodeObject *)str,
3682 (PyUnicodeObject *)substr,
3683 start, end, direction);
3684 Py_DECREF(str);
3685 Py_DECREF(substr);
3686 return result;
3687}
3688
3689static
3690int tailmatch(PyUnicodeObject *self,
3691 PyUnicodeObject *substring,
3692 int start,
3693 int end,
3694 int direction)
3695{
3696 if (start < 0)
3697 start += self->length;
3698 if (start < 0)
3699 start = 0;
3700
3701 if (substring->length == 0)
3702 return 1;
3703
3704 if (end > self->length)
3705 end = self->length;
3706 if (end < 0)
3707 end += self->length;
3708 if (end < 0)
3709 end = 0;
3710
3711 end -= substring->length;
3712 if (end < start)
3713 return 0;
3714
3715 if (direction > 0) {
3716 if (Py_UNICODE_MATCH(self, end, substring))
3717 return 1;
3718 } else {
3719 if (Py_UNICODE_MATCH(self, start, substring))
3720 return 1;
3721 }
3722
3723 return 0;
3724}
3725
3726int PyUnicode_Tailmatch(PyObject *str,
3727 PyObject *substr,
3728 int start,
3729 int end,
3730 int direction)
3731{
3732 int result;
3733
3734 str = PyUnicode_FromObject(str);
3735 if (str == NULL)
3736 return -1;
3737 substr = PyUnicode_FromObject(substr);
3738 if (substr == NULL) {
3739 Py_DECREF(substr);
3740 return -1;
3741 }
3742
3743 result = tailmatch((PyUnicodeObject *)str,
3744 (PyUnicodeObject *)substr,
3745 start, end, direction);
3746 Py_DECREF(str);
3747 Py_DECREF(substr);
3748 return result;
3749}
3750
3751static
3752const Py_UNICODE *findchar(const Py_UNICODE *s,
3753 int size,
3754 Py_UNICODE ch)
3755{
3756 /* like wcschr, but doesn't stop at NULL characters */
3757
3758 while (size-- > 0) {
3759 if (*s == ch)
3760 return s;
3761 s++;
3762 }
3763
3764 return NULL;
3765}
3766
3767/* Apply fixfct filter to the Unicode object self and return a
3768 reference to the modified object */
3769
3770static
3771PyObject *fixup(PyUnicodeObject *self,
3772 int (*fixfct)(PyUnicodeObject *s))
3773{
3774
3775 PyUnicodeObject *u;
3776
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003777 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 if (u == NULL)
3779 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003780
3781 Py_UNICODE_COPY(u->str, self->str, self->length);
3782
Tim Peters7a29bd52001-09-12 03:03:31 +00003783 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 /* fixfct should return TRUE if it modified the buffer. If
3785 FALSE, return a reference to the original buffer instead
3786 (to save space, not time) */
3787 Py_INCREF(self);
3788 Py_DECREF(u);
3789 return (PyObject*) self;
3790 }
3791 return (PyObject*) u;
3792}
3793
3794static
3795int fixupper(PyUnicodeObject *self)
3796{
3797 int len = self->length;
3798 Py_UNICODE *s = self->str;
3799 int status = 0;
3800
3801 while (len-- > 0) {
3802 register Py_UNICODE ch;
3803
3804 ch = Py_UNICODE_TOUPPER(*s);
3805 if (ch != *s) {
3806 status = 1;
3807 *s = ch;
3808 }
3809 s++;
3810 }
3811
3812 return status;
3813}
3814
3815static
3816int fixlower(PyUnicodeObject *self)
3817{
3818 int len = self->length;
3819 Py_UNICODE *s = self->str;
3820 int status = 0;
3821
3822 while (len-- > 0) {
3823 register Py_UNICODE ch;
3824
3825 ch = Py_UNICODE_TOLOWER(*s);
3826 if (ch != *s) {
3827 status = 1;
3828 *s = ch;
3829 }
3830 s++;
3831 }
3832
3833 return status;
3834}
3835
3836static
3837int fixswapcase(PyUnicodeObject *self)
3838{
3839 int len = self->length;
3840 Py_UNICODE *s = self->str;
3841 int status = 0;
3842
3843 while (len-- > 0) {
3844 if (Py_UNICODE_ISUPPER(*s)) {
3845 *s = Py_UNICODE_TOLOWER(*s);
3846 status = 1;
3847 } else if (Py_UNICODE_ISLOWER(*s)) {
3848 *s = Py_UNICODE_TOUPPER(*s);
3849 status = 1;
3850 }
3851 s++;
3852 }
3853
3854 return status;
3855}
3856
3857static
3858int fixcapitalize(PyUnicodeObject *self)
3859{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003860 int len = self->length;
3861 Py_UNICODE *s = self->str;
3862 int status = 0;
3863
3864 if (len == 0)
3865 return 0;
3866 if (Py_UNICODE_ISLOWER(*s)) {
3867 *s = Py_UNICODE_TOUPPER(*s);
3868 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003870 s++;
3871 while (--len > 0) {
3872 if (Py_UNICODE_ISUPPER(*s)) {
3873 *s = Py_UNICODE_TOLOWER(*s);
3874 status = 1;
3875 }
3876 s++;
3877 }
3878 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879}
3880
3881static
3882int fixtitle(PyUnicodeObject *self)
3883{
3884 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3885 register Py_UNICODE *e;
3886 int previous_is_cased;
3887
3888 /* Shortcut for single character strings */
3889 if (PyUnicode_GET_SIZE(self) == 1) {
3890 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3891 if (*p != ch) {
3892 *p = ch;
3893 return 1;
3894 }
3895 else
3896 return 0;
3897 }
3898
3899 e = p + PyUnicode_GET_SIZE(self);
3900 previous_is_cased = 0;
3901 for (; p < e; p++) {
3902 register const Py_UNICODE ch = *p;
3903
3904 if (previous_is_cased)
3905 *p = Py_UNICODE_TOLOWER(ch);
3906 else
3907 *p = Py_UNICODE_TOTITLE(ch);
3908
3909 if (Py_UNICODE_ISLOWER(ch) ||
3910 Py_UNICODE_ISUPPER(ch) ||
3911 Py_UNICODE_ISTITLE(ch))
3912 previous_is_cased = 1;
3913 else
3914 previous_is_cased = 0;
3915 }
3916 return 1;
3917}
3918
3919PyObject *PyUnicode_Join(PyObject *separator,
3920 PyObject *seq)
3921{
3922 Py_UNICODE *sep;
3923 int seplen;
3924 PyUnicodeObject *res = NULL;
3925 int reslen = 0;
3926 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 int sz = 100;
3928 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003929 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930
Tim Peters2cfe3682001-05-05 05:36:48 +00003931 it = PyObject_GetIter(seq);
3932 if (it == NULL)
3933 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934
3935 if (separator == NULL) {
3936 Py_UNICODE blank = ' ';
3937 sep = &blank;
3938 seplen = 1;
3939 }
3940 else {
3941 separator = PyUnicode_FromObject(separator);
3942 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003943 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944 sep = PyUnicode_AS_UNICODE(separator);
3945 seplen = PyUnicode_GET_SIZE(separator);
3946 }
3947
3948 res = _PyUnicode_New(sz);
3949 if (res == NULL)
3950 goto onError;
3951 p = PyUnicode_AS_UNICODE(res);
3952 reslen = 0;
3953
Tim Peters2cfe3682001-05-05 05:36:48 +00003954 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003956 PyObject *item = PyIter_Next(it);
3957 if (item == NULL) {
3958 if (PyErr_Occurred())
3959 goto onError;
3960 break;
3961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962 if (!PyUnicode_Check(item)) {
3963 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003964 if (!PyString_Check(item)) {
3965 PyErr_Format(PyExc_TypeError,
3966 "sequence item %i: expected string or Unicode,"
3967 " %.80s found",
3968 i, item->ob_type->tp_name);
3969 Py_DECREF(item);
3970 goto onError;
3971 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972 v = PyUnicode_FromObject(item);
3973 Py_DECREF(item);
3974 item = v;
3975 if (item == NULL)
3976 goto onError;
3977 }
3978 itemlen = PyUnicode_GET_SIZE(item);
3979 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003980 if (_PyUnicode_Resize(&res, sz*2)) {
3981 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 sz *= 2;
3985 p = PyUnicode_AS_UNICODE(res) + reslen;
3986 }
3987 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003988 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 p += seplen;
3990 reslen += seplen;
3991 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003992 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993 p += itemlen;
3994 reslen += itemlen;
3995 Py_DECREF(item);
3996 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003997 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 goto onError;
3999
4000 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004001 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002 return (PyObject *)res;
4003
4004 onError:
4005 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004006 Py_XDECREF(res);
4007 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 return NULL;
4009}
4010
4011static
4012PyUnicodeObject *pad(PyUnicodeObject *self,
4013 int left,
4014 int right,
4015 Py_UNICODE fill)
4016{
4017 PyUnicodeObject *u;
4018
4019 if (left < 0)
4020 left = 0;
4021 if (right < 0)
4022 right = 0;
4023
Tim Peters7a29bd52001-09-12 03:03:31 +00004024 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025 Py_INCREF(self);
4026 return self;
4027 }
4028
4029 u = _PyUnicode_New(left + self->length + right);
4030 if (u) {
4031 if (left)
4032 Py_UNICODE_FILL(u->str, fill, left);
4033 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4034 if (right)
4035 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4036 }
4037
4038 return u;
4039}
4040
/* Append the slice data[left:right] to the list being built.

   Expects the expanding function to provide:
     - a PyObject *str scratch variable,
     - a PyObject *list to append to,
     - an onError label, which is jumped to if creating the slice or
       appending it fails.

   Wrapped in do { } while (0) so that the expansion is a single
   statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);                                             \
    } while (0)
4051
4052static
4053PyObject *split_whitespace(PyUnicodeObject *self,
4054 PyObject *list,
4055 int maxcount)
4056{
4057 register int i;
4058 register int j;
4059 int len = self->length;
4060 PyObject *str;
4061
4062 for (i = j = 0; i < len; ) {
4063 /* find a token */
4064 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4065 i++;
4066 j = i;
4067 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4068 i++;
4069 if (j < i) {
4070 if (maxcount-- <= 0)
4071 break;
4072 SPLIT_APPEND(self->str, j, i);
4073 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4074 i++;
4075 j = i;
4076 }
4077 }
4078 if (j < len) {
4079 SPLIT_APPEND(self->str, j, len);
4080 }
4081 return list;
4082
4083 onError:
4084 Py_DECREF(list);
4085 return NULL;
4086}
4087
4088PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004089 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090{
4091 register int i;
4092 register int j;
4093 int len;
4094 PyObject *list;
4095 PyObject *str;
4096 Py_UNICODE *data;
4097
4098 string = PyUnicode_FromObject(string);
4099 if (string == NULL)
4100 return NULL;
4101 data = PyUnicode_AS_UNICODE(string);
4102 len = PyUnicode_GET_SIZE(string);
4103
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 list = PyList_New(0);
4105 if (!list)
4106 goto onError;
4107
4108 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004109 int eol;
4110
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111 /* Find a line and append it */
4112 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4113 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114
4115 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004116 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 if (i < len) {
4118 if (data[i] == '\r' && i + 1 < len &&
4119 data[i+1] == '\n')
4120 i += 2;
4121 else
4122 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004123 if (keepends)
4124 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 }
Guido van Rossum86662912000-04-11 15:38:46 +00004126 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 j = i;
4128 }
4129 if (j < len) {
4130 SPLIT_APPEND(data, j, len);
4131 }
4132
4133 Py_DECREF(string);
4134 return list;
4135
4136 onError:
4137 Py_DECREF(list);
4138 Py_DECREF(string);
4139 return NULL;
4140}
4141
4142static
4143PyObject *split_char(PyUnicodeObject *self,
4144 PyObject *list,
4145 Py_UNICODE ch,
4146 int maxcount)
4147{
4148 register int i;
4149 register int j;
4150 int len = self->length;
4151 PyObject *str;
4152
4153 for (i = j = 0; i < len; ) {
4154 if (self->str[i] == ch) {
4155 if (maxcount-- <= 0)
4156 break;
4157 SPLIT_APPEND(self->str, j, i);
4158 i = j = i + 1;
4159 } else
4160 i++;
4161 }
4162 if (j <= len) {
4163 SPLIT_APPEND(self->str, j, len);
4164 }
4165 return list;
4166
4167 onError:
4168 Py_DECREF(list);
4169 return NULL;
4170}
4171
4172static
4173PyObject *split_substring(PyUnicodeObject *self,
4174 PyObject *list,
4175 PyUnicodeObject *substring,
4176 int maxcount)
4177{
4178 register int i;
4179 register int j;
4180 int len = self->length;
4181 int sublen = substring->length;
4182 PyObject *str;
4183
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004184 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185 if (Py_UNICODE_MATCH(self, i, substring)) {
4186 if (maxcount-- <= 0)
4187 break;
4188 SPLIT_APPEND(self->str, j, i);
4189 i = j = i + sublen;
4190 } else
4191 i++;
4192 }
4193 if (j <= len) {
4194 SPLIT_APPEND(self->str, j, len);
4195 }
4196 return list;
4197
4198 onError:
4199 Py_DECREF(list);
4200 return NULL;
4201}
4202
4203#undef SPLIT_APPEND
4204
4205static
4206PyObject *split(PyUnicodeObject *self,
4207 PyUnicodeObject *substring,
4208 int maxcount)
4209{
4210 PyObject *list;
4211
4212 if (maxcount < 0)
4213 maxcount = INT_MAX;
4214
4215 list = PyList_New(0);
4216 if (!list)
4217 return NULL;
4218
4219 if (substring == NULL)
4220 return split_whitespace(self,list,maxcount);
4221
4222 else if (substring->length == 1)
4223 return split_char(self,list,substring->str[0],maxcount);
4224
4225 else if (substring->length == 0) {
4226 Py_DECREF(list);
4227 PyErr_SetString(PyExc_ValueError, "empty separator");
4228 return NULL;
4229 }
4230 else
4231 return split_substring(self,list,substring,maxcount);
4232}
4233
4234static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235PyObject *replace(PyUnicodeObject *self,
4236 PyUnicodeObject *str1,
4237 PyUnicodeObject *str2,
4238 int maxcount)
4239{
4240 PyUnicodeObject *u;
4241
4242 if (maxcount < 0)
4243 maxcount = INT_MAX;
4244
4245 if (str1->length == 1 && str2->length == 1) {
4246 int i;
4247
4248 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004249 if (!findchar(self->str, self->length, str1->str[0]) &&
4250 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251 /* nothing to replace, return original string */
4252 Py_INCREF(self);
4253 u = self;
4254 } else {
4255 Py_UNICODE u1 = str1->str[0];
4256 Py_UNICODE u2 = str2->str[0];
4257
4258 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004259 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 self->length
4261 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004262 if (u != NULL) {
4263 Py_UNICODE_COPY(u->str, self->str,
4264 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265 for (i = 0; i < u->length; i++)
4266 if (u->str[i] == u1) {
4267 if (--maxcount < 0)
4268 break;
4269 u->str[i] = u2;
4270 }
4271 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273
4274 } else {
4275 int n, i;
4276 Py_UNICODE *p;
4277
4278 /* replace strings */
4279 n = count(self, 0, self->length, str1);
4280 if (n > maxcount)
4281 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004282 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004284 if (PyUnicode_CheckExact(self)) {
4285 Py_INCREF(self);
4286 u = self;
4287 }
4288 else {
4289 u = (PyUnicodeObject *)
4290 PyUnicode_FromUnicode(self->str, self->length);
4291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 } else {
4293 u = _PyUnicode_New(
4294 self->length + n * (str2->length - str1->length));
4295 if (u) {
4296 i = 0;
4297 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004298 if (str1->length > 0) {
4299 while (i <= self->length - str1->length)
4300 if (Py_UNICODE_MATCH(self, i, str1)) {
4301 /* replace string segment */
4302 Py_UNICODE_COPY(p, str2->str, str2->length);
4303 p += str2->length;
4304 i += str1->length;
4305 if (--n <= 0) {
4306 /* copy remaining part */
4307 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4308 break;
4309 }
4310 } else
4311 *p++ = self->str[i++];
4312 } else {
4313 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314 Py_UNICODE_COPY(p, str2->str, str2->length);
4315 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004316 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004319 }
4320 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4321 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322 }
4323 }
4324 }
4325
4326 return (PyObject *) u;
4327}
4328
4329/* --- Unicode Object Methods --------------------------------------------- */
4330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004331PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332"S.title() -> unicode\n\
4333\n\
4334Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004335characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336
4337static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004338unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340 return fixup(self, fixtitle);
4341}
4342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004343PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344"S.capitalize() -> unicode\n\
4345\n\
4346Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004347have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348
4349static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004350unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352 return fixup(self, fixcapitalize);
4353}
4354
4355#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004356PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357"S.capwords() -> unicode\n\
4358\n\
4359Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004360normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361
4362static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004363unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364{
4365 PyObject *list;
4366 PyObject *item;
4367 int i;
4368
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369 /* Split into words */
4370 list = split(self, NULL, -1);
4371 if (!list)
4372 return NULL;
4373
4374 /* Capitalize each word */
4375 for (i = 0; i < PyList_GET_SIZE(list); i++) {
4376 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
4377 fixcapitalize);
4378 if (item == NULL)
4379 goto onError;
4380 Py_DECREF(PyList_GET_ITEM(list, i));
4381 PyList_SET_ITEM(list, i, item);
4382 }
4383
4384 /* Join the words to form a new string */
4385 item = PyUnicode_Join(NULL, list);
4386
4387onError:
4388 Py_DECREF(list);
4389 return (PyObject *)item;
4390}
4391#endif
4392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004393PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394"S.center(width) -> unicode\n\
4395\n\
4396Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004397using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398
4399static PyObject *
4400unicode_center(PyUnicodeObject *self, PyObject *args)
4401{
4402 int marg, left;
4403 int width;
4404
4405 if (!PyArg_ParseTuple(args, "i:center", &width))
4406 return NULL;
4407
Tim Peters7a29bd52001-09-12 03:03:31 +00004408 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409 Py_INCREF(self);
4410 return (PyObject*) self;
4411 }
4412
4413 marg = width - self->length;
4414 left = marg / 2 + (marg & width & 1);
4415
4416 return (PyObject*) pad(self, left, marg - left, ' ');
4417}
4418
Marc-André Lemburge5034372000-08-08 08:04:29 +00004419#if 0
4420
4421/* This code should go into some future Unicode collation support
4422 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004423 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004424
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004425/* speedy UTF-16 code point order comparison */
4426/* gleaned from: */
4427/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4428
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004429static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004430{
4431 0, 0, 0, 0, 0, 0, 0, 0,
4432 0, 0, 0, 0, 0, 0, 0, 0,
4433 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004434 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004435};
4436
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437static int
4438unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4439{
4440 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004441
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 Py_UNICODE *s1 = str1->str;
4443 Py_UNICODE *s2 = str2->str;
4444
4445 len1 = str1->length;
4446 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004447
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004449 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004450
4451 c1 = *s1++;
4452 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004453
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004454 if (c1 > (1<<11) * 26)
4455 c1 += utf16Fixup[c1>>11];
4456 if (c2 > (1<<11) * 26)
4457 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004458 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004459
4460 if (c1 != c2)
4461 return (c1 < c2) ? -1 : 1;
4462
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004463 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004464 }
4465
4466 return (len1 < len2) ? -1 : (len1 != len2);
4467}
4468
Marc-André Lemburge5034372000-08-08 08:04:29 +00004469#else
4470
4471static int
4472unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4473{
4474 register int len1, len2;
4475
4476 Py_UNICODE *s1 = str1->str;
4477 Py_UNICODE *s2 = str2->str;
4478
4479 len1 = str1->length;
4480 len2 = str2->length;
4481
4482 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00004483 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004484
Fredrik Lundh45714e92001-06-26 16:39:36 +00004485 c1 = *s1++;
4486 c2 = *s2++;
4487
4488 if (c1 != c2)
4489 return (c1 < c2) ? -1 : 1;
4490
Marc-André Lemburge5034372000-08-08 08:04:29 +00004491 len1--; len2--;
4492 }
4493
4494 return (len1 < len2) ? -1 : (len1 != len2);
4495}
4496
4497#endif
4498
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499int PyUnicode_Compare(PyObject *left,
4500 PyObject *right)
4501{
4502 PyUnicodeObject *u = NULL, *v = NULL;
4503 int result;
4504
4505 /* Coerce the two arguments */
4506 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4507 if (u == NULL)
4508 goto onError;
4509 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4510 if (v == NULL)
4511 goto onError;
4512
Thomas Wouters7e474022000-07-16 12:04:32 +00004513 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514 if (v == u) {
4515 Py_DECREF(u);
4516 Py_DECREF(v);
4517 return 0;
4518 }
4519
4520 result = unicode_compare(u, v);
4521
4522 Py_DECREF(u);
4523 Py_DECREF(v);
4524 return result;
4525
4526onError:
4527 Py_XDECREF(u);
4528 Py_XDECREF(v);
4529 return -1;
4530}
4531
Guido van Rossum403d68b2000-03-13 15:55:09 +00004532int PyUnicode_Contains(PyObject *container,
4533 PyObject *element)
4534{
4535 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004536 int result, size;
4537 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004538
4539 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004540 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004541 if (v == NULL) {
4542 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004543 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004544 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004545 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004546 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004547 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004548 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004549
Barry Warsaw817918c2002-08-06 16:58:21 +00004550 size = PyUnicode_GET_SIZE(v);
4551 rhs = PyUnicode_AS_UNICODE(v);
4552 lhs = PyUnicode_AS_UNICODE(u);
4553
Guido van Rossum403d68b2000-03-13 15:55:09 +00004554 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004555 if (size == 1) {
4556 end = lhs + PyUnicode_GET_SIZE(u);
4557 while (lhs < end) {
4558 if (*lhs++ == *rhs) {
4559 result = 1;
4560 break;
4561 }
4562 }
4563 }
4564 else {
4565 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4566 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004567 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004568 result = 1;
4569 break;
4570 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004571 }
4572 }
4573
4574 Py_DECREF(u);
4575 Py_DECREF(v);
4576 return result;
4577
4578onError:
4579 Py_XDECREF(u);
4580 Py_XDECREF(v);
4581 return -1;
4582}
4583
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584/* Concat to string or Unicode object giving a new Unicode object. */
4585
4586PyObject *PyUnicode_Concat(PyObject *left,
4587 PyObject *right)
4588{
4589 PyUnicodeObject *u = NULL, *v = NULL, *w;
4590
4591 /* Coerce the two arguments */
4592 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4593 if (u == NULL)
4594 goto onError;
4595 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4596 if (v == NULL)
4597 goto onError;
4598
4599 /* Shortcuts */
4600 if (v == unicode_empty) {
4601 Py_DECREF(v);
4602 return (PyObject *)u;
4603 }
4604 if (u == unicode_empty) {
4605 Py_DECREF(u);
4606 return (PyObject *)v;
4607 }
4608
4609 /* Concat the two Unicode strings */
4610 w = _PyUnicode_New(u->length + v->length);
4611 if (w == NULL)
4612 goto onError;
4613 Py_UNICODE_COPY(w->str, u->str, u->length);
4614 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4615
4616 Py_DECREF(u);
4617 Py_DECREF(v);
4618 return (PyObject *)w;
4619
4620onError:
4621 Py_XDECREF(u);
4622 Py_XDECREF(v);
4623 return NULL;
4624}
4625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004626PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627"S.count(sub[, start[, end]]) -> int\n\
4628\n\
4629Return the number of occurrences of substring sub in Unicode string\n\
4630S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004631interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632
4633static PyObject *
4634unicode_count(PyUnicodeObject *self, PyObject *args)
4635{
4636 PyUnicodeObject *substring;
4637 int start = 0;
4638 int end = INT_MAX;
4639 PyObject *result;
4640
Guido van Rossumb8872e62000-05-09 14:14:27 +00004641 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4642 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643 return NULL;
4644
4645 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4646 (PyObject *)substring);
4647 if (substring == NULL)
4648 return NULL;
4649
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650 if (start < 0)
4651 start += self->length;
4652 if (start < 0)
4653 start = 0;
4654 if (end > self->length)
4655 end = self->length;
4656 if (end < 0)
4657 end += self->length;
4658 if (end < 0)
4659 end = 0;
4660
4661 result = PyInt_FromLong((long) count(self, start, end, substring));
4662
4663 Py_DECREF(substring);
4664 return result;
4665}
4666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004667PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668"S.encode([encoding[,errors]]) -> string\n\
4669\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004670Return an encoded string version of S. Default encoding is the current\n\
4671default string encoding. errors may be given to set a different error\n\
4672handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004673a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4674'xmlcharrefreplace' as well as any other name registered with\n\
4675codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676
4677static PyObject *
4678unicode_encode(PyUnicodeObject *self, PyObject *args)
4679{
4680 char *encoding = NULL;
4681 char *errors = NULL;
4682 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4683 return NULL;
4684 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4685}
4686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004687PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688"S.expandtabs([tabsize]) -> unicode\n\
4689\n\
4690Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004691If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692
4693static PyObject*
4694unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4695{
4696 Py_UNICODE *e;
4697 Py_UNICODE *p;
4698 Py_UNICODE *q;
4699 int i, j;
4700 PyUnicodeObject *u;
4701 int tabsize = 8;
4702
4703 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4704 return NULL;
4705
Thomas Wouters7e474022000-07-16 12:04:32 +00004706 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707 i = j = 0;
4708 e = self->str + self->length;
4709 for (p = self->str; p < e; p++)
4710 if (*p == '\t') {
4711 if (tabsize > 0)
4712 j += tabsize - (j % tabsize);
4713 }
4714 else {
4715 j++;
4716 if (*p == '\n' || *p == '\r') {
4717 i += j;
4718 j = 0;
4719 }
4720 }
4721
4722 /* Second pass: create output string and fill it */
4723 u = _PyUnicode_New(i + j);
4724 if (!u)
4725 return NULL;
4726
4727 j = 0;
4728 q = u->str;
4729
4730 for (p = self->str; p < e; p++)
4731 if (*p == '\t') {
4732 if (tabsize > 0) {
4733 i = tabsize - (j % tabsize);
4734 j += i;
4735 while (i--)
4736 *q++ = ' ';
4737 }
4738 }
4739 else {
4740 j++;
4741 *q++ = *p;
4742 if (*p == '\n' || *p == '\r')
4743 j = 0;
4744 }
4745
4746 return (PyObject*) u;
4747}
4748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004749PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750"S.find(sub [,start [,end]]) -> int\n\
4751\n\
4752Return the lowest index in S where substring sub is found,\n\
4753such that sub is contained within s[start,end]. Optional\n\
4754arguments start and end are interpreted as in slice notation.\n\
4755\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004756Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757
4758static PyObject *
4759unicode_find(PyUnicodeObject *self, PyObject *args)
4760{
4761 PyUnicodeObject *substring;
4762 int start = 0;
4763 int end = INT_MAX;
4764 PyObject *result;
4765
Guido van Rossumb8872e62000-05-09 14:14:27 +00004766 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4767 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 return NULL;
4769 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4770 (PyObject *)substring);
4771 if (substring == NULL)
4772 return NULL;
4773
4774 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4775
4776 Py_DECREF(substring);
4777 return result;
4778}
4779
4780static PyObject *
4781unicode_getitem(PyUnicodeObject *self, int index)
4782{
4783 if (index < 0 || index >= self->length) {
4784 PyErr_SetString(PyExc_IndexError, "string index out of range");
4785 return NULL;
4786 }
4787
4788 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4789}
4790
4791static long
4792unicode_hash(PyUnicodeObject *self)
4793{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004794 /* Since Unicode objects compare equal to their ASCII string
4795 counterparts, they should use the individual character values
4796 as basis for their hash value. This is needed to assure that
4797 strings and Unicode objects behave in the same way as
4798 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799
Fredrik Lundhdde61642000-07-10 18:27:47 +00004800 register int len;
4801 register Py_UNICODE *p;
4802 register long x;
4803
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804 if (self->hash != -1)
4805 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004806 len = PyUnicode_GET_SIZE(self);
4807 p = PyUnicode_AS_UNICODE(self);
4808 x = *p << 7;
4809 while (--len >= 0)
4810 x = (1000003*x) ^ *p++;
4811 x ^= PyUnicode_GET_SIZE(self);
4812 if (x == -1)
4813 x = -2;
4814 self->hash = x;
4815 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816}
4817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004818PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819"S.index(sub [,start [,end]]) -> int\n\
4820\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004821Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822
4823static PyObject *
4824unicode_index(PyUnicodeObject *self, PyObject *args)
4825{
4826 int result;
4827 PyUnicodeObject *substring;
4828 int start = 0;
4829 int end = INT_MAX;
4830
Guido van Rossumb8872e62000-05-09 14:14:27 +00004831 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4832 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 return NULL;
4834
4835 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4836 (PyObject *)substring);
4837 if (substring == NULL)
4838 return NULL;
4839
4840 result = findstring(self, substring, start, end, 1);
4841
4842 Py_DECREF(substring);
4843 if (result < 0) {
4844 PyErr_SetString(PyExc_ValueError, "substring not found");
4845 return NULL;
4846 }
4847 return PyInt_FromLong(result);
4848}
4849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004850PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004851"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004853Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004854at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855
4856static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004857unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858{
4859 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4860 register const Py_UNICODE *e;
4861 int cased;
4862
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863 /* Shortcut for single character strings */
4864 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004865 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004867 /* Special case for empty strings */
4868 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004869 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004870
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 e = p + PyUnicode_GET_SIZE(self);
4872 cased = 0;
4873 for (; p < e; p++) {
4874 register const Py_UNICODE ch = *p;
4875
4876 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004877 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 else if (!cased && Py_UNICODE_ISLOWER(ch))
4879 cased = 1;
4880 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004881 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882}
4883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004884PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004885"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004887Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004888at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889
4890static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004891unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892{
4893 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4894 register const Py_UNICODE *e;
4895 int cased;
4896
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 /* Shortcut for single character strings */
4898 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004899 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004901 /* Special case for empty strings */
4902 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004903 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004904
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 e = p + PyUnicode_GET_SIZE(self);
4906 cased = 0;
4907 for (; p < e; p++) {
4908 register const Py_UNICODE ch = *p;
4909
4910 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004911 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 else if (!cased && Py_UNICODE_ISUPPER(ch))
4913 cased = 1;
4914 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004915 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916}
4917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004918PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004919"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004921Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4922characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004923only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924
4925static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004926unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927{
4928 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4929 register const Py_UNICODE *e;
4930 int cased, previous_is_cased;
4931
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932 /* Shortcut for single character strings */
4933 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004934 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4935 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004937 /* Special case for empty strings */
4938 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004939 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004940
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 e = p + PyUnicode_GET_SIZE(self);
4942 cased = 0;
4943 previous_is_cased = 0;
4944 for (; p < e; p++) {
4945 register const Py_UNICODE ch = *p;
4946
4947 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4948 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004949 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950 previous_is_cased = 1;
4951 cased = 1;
4952 }
4953 else if (Py_UNICODE_ISLOWER(ch)) {
4954 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004955 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 previous_is_cased = 1;
4957 cased = 1;
4958 }
4959 else
4960 previous_is_cased = 0;
4961 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004962 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963}
4964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004965PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004966"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004968Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004969False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970
4971static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004972unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973{
4974 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4975 register const Py_UNICODE *e;
4976
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 /* Shortcut for single character strings */
4978 if (PyUnicode_GET_SIZE(self) == 1 &&
4979 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004980 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004982 /* Special case for empty strings */
4983 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004984 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004985
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 e = p + PyUnicode_GET_SIZE(self);
4987 for (; p < e; p++) {
4988 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004989 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004991 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992}
4993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004994PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004995"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004996\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004997Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004998and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004999
5000static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005001unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005002{
5003 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5004 register const Py_UNICODE *e;
5005
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005006 /* Shortcut for single character strings */
5007 if (PyUnicode_GET_SIZE(self) == 1 &&
5008 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005009 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005010
5011 /* Special case for empty strings */
5012 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005013 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005014
5015 e = p + PyUnicode_GET_SIZE(self);
5016 for (; p < e; p++) {
5017 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005018 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005019 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005020 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005021}
5022
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005023PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005024"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005025\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005026Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005027and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005028
5029static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005030unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005031{
5032 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5033 register const Py_UNICODE *e;
5034
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005035 /* Shortcut for single character strings */
5036 if (PyUnicode_GET_SIZE(self) == 1 &&
5037 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005038 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005039
5040 /* Special case for empty strings */
5041 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005042 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005043
5044 e = p + PyUnicode_GET_SIZE(self);
5045 for (; p < e; p++) {
5046 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005047 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005048 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005049 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005050}
5051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005052PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005053"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005055Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005056False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057
5058static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005059unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060{
5061 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5062 register const Py_UNICODE *e;
5063
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064 /* Shortcut for single character strings */
5065 if (PyUnicode_GET_SIZE(self) == 1 &&
5066 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005067 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005069 /* Special case for empty strings */
5070 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005071 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005072
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 e = p + PyUnicode_GET_SIZE(self);
5074 for (; p < e; p++) {
5075 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005076 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005078 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079}
5080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005081PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005082"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005084Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005085False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086
5087static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005088unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089{
5090 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5091 register const Py_UNICODE *e;
5092
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093 /* Shortcut for single character strings */
5094 if (PyUnicode_GET_SIZE(self) == 1 &&
5095 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005096 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005098 /* Special case for empty strings */
5099 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005100 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005101
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 e = p + PyUnicode_GET_SIZE(self);
5103 for (; p < e; p++) {
5104 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005105 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005107 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108}
5109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005110PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005111"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005113Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005114False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115
5116static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005117unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118{
5119 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5120 register const Py_UNICODE *e;
5121
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 /* Shortcut for single character strings */
5123 if (PyUnicode_GET_SIZE(self) == 1 &&
5124 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005125 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005127 /* Special case for empty strings */
5128 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005129 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005130
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 e = p + PyUnicode_GET_SIZE(self);
5132 for (; p < e; p++) {
5133 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005134 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005136 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137}
5138
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005139PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140"S.join(sequence) -> unicode\n\
5141\n\
5142Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005143sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144
5145static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005146unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005148 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149}
5150
5151static int
5152unicode_length(PyUnicodeObject *self)
5153{
5154 return self->length;
5155}
5156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005157PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158"S.ljust(width) -> unicode\n\
5159\n\
5160Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005161done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162
5163static PyObject *
5164unicode_ljust(PyUnicodeObject *self, PyObject *args)
5165{
5166 int width;
5167 if (!PyArg_ParseTuple(args, "i:ljust", &width))
5168 return NULL;
5169
Tim Peters7a29bd52001-09-12 03:03:31 +00005170 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 Py_INCREF(self);
5172 return (PyObject*) self;
5173 }
5174
5175 return (PyObject*) pad(self, 0, width - self->length, ' ');
5176}
5177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005178PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179"S.lower() -> unicode\n\
5180\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005181Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182
5183static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005184unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 return fixup(self, fixlower);
5187}
5188
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above.  The table is read-only, so both the strings
   and the array slots are const. */
static const char *const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
5197
5198static const Py_UNICODE *
5199unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5200{
Tim Peters030a5ce2002-04-22 19:00:10 +00005201 size_t i;
5202 for (i = 0; i < n; ++i)
5203 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005204 return s+i;
5205 return NULL;
5206}
5207
5208/* externally visible for str.strip(unicode) */
5209PyObject *
5210_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5211{
5212 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5213 int len = PyUnicode_GET_SIZE(self);
5214 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5215 int seplen = PyUnicode_GET_SIZE(sepobj);
5216 int i, j;
5217
5218 i = 0;
5219 if (striptype != RIGHTSTRIP) {
5220 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5221 i++;
5222 }
5223 }
5224
5225 j = len;
5226 if (striptype != LEFTSTRIP) {
5227 do {
5228 j--;
5229 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5230 j++;
5231 }
5232
5233 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5234 Py_INCREF(self);
5235 return (PyObject*)self;
5236 }
5237 else
5238 return PyUnicode_FromUnicode(s+i, j-i);
5239}
5240
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241
5242static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005243do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005245 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5246 int len = PyUnicode_GET_SIZE(self), i, j;
5247
5248 i = 0;
5249 if (striptype != RIGHTSTRIP) {
5250 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5251 i++;
5252 }
5253 }
5254
5255 j = len;
5256 if (striptype != LEFTSTRIP) {
5257 do {
5258 j--;
5259 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5260 j++;
5261 }
5262
5263 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5264 Py_INCREF(self);
5265 return (PyObject*)self;
5266 }
5267 else
5268 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269}
5270
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005271
5272static PyObject *
5273do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5274{
5275 PyObject *sep = NULL;
5276
5277 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5278 return NULL;
5279
5280 if (sep != NULL && sep != Py_None) {
5281 if (PyUnicode_Check(sep))
5282 return _PyUnicode_XStrip(self, striptype, sep);
5283 else if (PyString_Check(sep)) {
5284 PyObject *res;
5285 sep = PyUnicode_FromObject(sep);
5286 if (sep==NULL)
5287 return NULL;
5288 res = _PyUnicode_XStrip(self, striptype, sep);
5289 Py_DECREF(sep);
5290 return res;
5291 }
5292 else {
5293 PyErr_Format(PyExc_TypeError,
5294 "%s arg must be None, unicode or str",
5295 STRIPNAME(striptype));
5296 return NULL;
5297 }
5298 }
5299
5300 return do_strip(self, striptype);
5301}
5302
5303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005304PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005305"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005306\n\
5307Return a copy of the string S with leading and trailing\n\
5308whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005309If chars is given and not None, remove characters in chars instead.\n\
5310If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005311
5312static PyObject *
5313unicode_strip(PyUnicodeObject *self, PyObject *args)
5314{
5315 if (PyTuple_GET_SIZE(args) == 0)
5316 return do_strip(self, BOTHSTRIP); /* Common case */
5317 else
5318 return do_argstrip(self, BOTHSTRIP, args);
5319}
5320
5321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005322PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005323"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005324\n\
5325Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005326If chars is given and not None, remove characters in chars instead.\n\
5327If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005328
5329static PyObject *
5330unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5331{
5332 if (PyTuple_GET_SIZE(args) == 0)
5333 return do_strip(self, LEFTSTRIP); /* Common case */
5334 else
5335 return do_argstrip(self, LEFTSTRIP, args);
5336}
5337
5338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005339PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005340"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005341\n\
5342Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005343If chars is given and not None, remove characters in chars instead.\n\
5344If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005345
5346static PyObject *
5347unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5348{
5349 if (PyTuple_GET_SIZE(args) == 0)
5350 return do_strip(self, RIGHTSTRIP); /* Common case */
5351 else
5352 return do_argstrip(self, RIGHTSTRIP, args);
5353}
5354
5355
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356static PyObject*
5357unicode_repeat(PyUnicodeObject *str, int len)
5358{
5359 PyUnicodeObject *u;
5360 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005361 int nchars;
5362 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363
5364 if (len < 0)
5365 len = 0;
5366
Tim Peters7a29bd52001-09-12 03:03:31 +00005367 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 /* no repeat, return original string */
5369 Py_INCREF(str);
5370 return (PyObject*) str;
5371 }
Tim Peters8f422462000-09-09 06:13:41 +00005372
5373 /* ensure # of chars needed doesn't overflow int and # of bytes
5374 * needed doesn't overflow size_t
5375 */
5376 nchars = len * str->length;
5377 if (len && nchars / len != str->length) {
5378 PyErr_SetString(PyExc_OverflowError,
5379 "repeated string is too long");
5380 return NULL;
5381 }
5382 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5383 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5384 PyErr_SetString(PyExc_OverflowError,
5385 "repeated string is too long");
5386 return NULL;
5387 }
5388 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 if (!u)
5390 return NULL;
5391
5392 p = u->str;
5393
5394 while (len-- > 0) {
5395 Py_UNICODE_COPY(p, str->str, str->length);
5396 p += str->length;
5397 }
5398
5399 return (PyObject*) u;
5400}
5401
5402PyObject *PyUnicode_Replace(PyObject *obj,
5403 PyObject *subobj,
5404 PyObject *replobj,
5405 int maxcount)
5406{
5407 PyObject *self;
5408 PyObject *str1;
5409 PyObject *str2;
5410 PyObject *result;
5411
5412 self = PyUnicode_FromObject(obj);
5413 if (self == NULL)
5414 return NULL;
5415 str1 = PyUnicode_FromObject(subobj);
5416 if (str1 == NULL) {
5417 Py_DECREF(self);
5418 return NULL;
5419 }
5420 str2 = PyUnicode_FromObject(replobj);
5421 if (str2 == NULL) {
5422 Py_DECREF(self);
5423 Py_DECREF(str1);
5424 return NULL;
5425 }
5426 result = replace((PyUnicodeObject *)self,
5427 (PyUnicodeObject *)str1,
5428 (PyUnicodeObject *)str2,
5429 maxcount);
5430 Py_DECREF(self);
5431 Py_DECREF(str1);
5432 Py_DECREF(str2);
5433 return result;
5434}
5435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005436PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437"S.replace (old, new[, maxsplit]) -> unicode\n\
5438\n\
5439Return a copy of S with all occurrences of substring\n\
5440old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005441given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442
5443static PyObject*
5444unicode_replace(PyUnicodeObject *self, PyObject *args)
5445{
5446 PyUnicodeObject *str1;
5447 PyUnicodeObject *str2;
5448 int maxcount = -1;
5449 PyObject *result;
5450
5451 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5452 return NULL;
5453 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5454 if (str1 == NULL)
5455 return NULL;
5456 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005457 if (str2 == NULL) {
5458 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005460 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461
5462 result = replace(self, str1, str2, maxcount);
5463
5464 Py_DECREF(str1);
5465 Py_DECREF(str2);
5466 return result;
5467}
5468
5469static
5470PyObject *unicode_repr(PyObject *unicode)
5471{
5472 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5473 PyUnicode_GET_SIZE(unicode),
5474 1);
5475}
5476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005477PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478"S.rfind(sub [,start [,end]]) -> int\n\
5479\n\
5480Return the highest index in S where substring sub is found,\n\
5481such that sub is contained within s[start,end]. Optional\n\
5482arguments start and end are interpreted as in slice notation.\n\
5483\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005484Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485
5486static PyObject *
5487unicode_rfind(PyUnicodeObject *self, PyObject *args)
5488{
5489 PyUnicodeObject *substring;
5490 int start = 0;
5491 int end = INT_MAX;
5492 PyObject *result;
5493
Guido van Rossumb8872e62000-05-09 14:14:27 +00005494 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5495 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 return NULL;
5497 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5498 (PyObject *)substring);
5499 if (substring == NULL)
5500 return NULL;
5501
5502 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5503
5504 Py_DECREF(substring);
5505 return result;
5506}
5507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005508PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509"S.rindex(sub [,start [,end]]) -> int\n\
5510\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005511Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512
5513static PyObject *
5514unicode_rindex(PyUnicodeObject *self, PyObject *args)
5515{
5516 int result;
5517 PyUnicodeObject *substring;
5518 int start = 0;
5519 int end = INT_MAX;
5520
Guido van Rossumb8872e62000-05-09 14:14:27 +00005521 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5522 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 return NULL;
5524 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5525 (PyObject *)substring);
5526 if (substring == NULL)
5527 return NULL;
5528
5529 result = findstring(self, substring, start, end, -1);
5530
5531 Py_DECREF(substring);
5532 if (result < 0) {
5533 PyErr_SetString(PyExc_ValueError, "substring not found");
5534 return NULL;
5535 }
5536 return PyInt_FromLong(result);
5537}
5538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005539PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540"S.rjust(width) -> unicode\n\
5541\n\
5542Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005543done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544
5545static PyObject *
5546unicode_rjust(PyUnicodeObject *self, PyObject *args)
5547{
5548 int width;
5549 if (!PyArg_ParseTuple(args, "i:rjust", &width))
5550 return NULL;
5551
Tim Peters7a29bd52001-09-12 03:03:31 +00005552 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 Py_INCREF(self);
5554 return (PyObject*) self;
5555 }
5556
5557 return (PyObject*) pad(self, width - self->length, 0, ' ');
5558}
5559
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560static PyObject*
5561unicode_slice(PyUnicodeObject *self, int start, int end)
5562{
5563 /* standard clamping */
5564 if (start < 0)
5565 start = 0;
5566 if (end < 0)
5567 end = 0;
5568 if (end > self->length)
5569 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005570 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 /* full slice, return original string */
5572 Py_INCREF(self);
5573 return (PyObject*) self;
5574 }
5575 if (start > end)
5576 start = end;
5577 /* copy slice */
5578 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5579 end - start);
5580}
5581
5582PyObject *PyUnicode_Split(PyObject *s,
5583 PyObject *sep,
5584 int maxsplit)
5585{
5586 PyObject *result;
5587
5588 s = PyUnicode_FromObject(s);
5589 if (s == NULL)
5590 return NULL;
5591 if (sep != NULL) {
5592 sep = PyUnicode_FromObject(sep);
5593 if (sep == NULL) {
5594 Py_DECREF(s);
5595 return NULL;
5596 }
5597 }
5598
5599 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5600
5601 Py_DECREF(s);
5602 Py_XDECREF(sep);
5603 return result;
5604}
5605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005606PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607"S.split([sep [,maxsplit]]) -> list of strings\n\
5608\n\
5609Return a list of the words in S, using sep as the\n\
5610delimiter string. If maxsplit is given, at most maxsplit\n\
5611splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005612is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613
5614static PyObject*
5615unicode_split(PyUnicodeObject *self, PyObject *args)
5616{
5617 PyObject *substring = Py_None;
5618 int maxcount = -1;
5619
5620 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5621 return NULL;
5622
5623 if (substring == Py_None)
5624 return split(self, NULL, maxcount);
5625 else if (PyUnicode_Check(substring))
5626 return split(self, (PyUnicodeObject *)substring, maxcount);
5627 else
5628 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5629}
5630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005631PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005632"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633\n\
5634Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005635Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005636is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637
5638static PyObject*
5639unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5640{
Guido van Rossum86662912000-04-11 15:38:46 +00005641 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642
Guido van Rossum86662912000-04-11 15:38:46 +00005643 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 return NULL;
5645
Guido van Rossum86662912000-04-11 15:38:46 +00005646 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647}
5648
5649static
5650PyObject *unicode_str(PyUnicodeObject *self)
5651{
Fred Drakee4315f52000-05-09 19:53:39 +00005652 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653}
5654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005655PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656"S.swapcase() -> unicode\n\
5657\n\
5658Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005659and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660
5661static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005662unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 return fixup(self, fixswapcase);
5665}
5666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005667PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668"S.translate(table) -> unicode\n\
5669\n\
5670Return a copy of the string S, where all characters have been mapped\n\
5671through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005672Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5673Unmapped characters are left untouched. Characters mapped to None\n\
5674are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675
5676static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005677unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 return PyUnicode_TranslateCharmap(self->str,
5680 self->length,
5681 table,
5682 "ignore");
5683}
5684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005685PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686"S.upper() -> unicode\n\
5687\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005688Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
5690static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005691unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 return fixup(self, fixupper);
5694}
5695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005696PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697"S.zfill(width) -> unicode\n\
5698\n\
5699Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005700of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701
5702static PyObject *
5703unicode_zfill(PyUnicodeObject *self, PyObject *args)
5704{
5705 int fill;
5706 PyUnicodeObject *u;
5707
5708 int width;
5709 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5710 return NULL;
5711
5712 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005713 if (PyUnicode_CheckExact(self)) {
5714 Py_INCREF(self);
5715 return (PyObject*) self;
5716 }
5717 else
5718 return PyUnicode_FromUnicode(
5719 PyUnicode_AS_UNICODE(self),
5720 PyUnicode_GET_SIZE(self)
5721 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722 }
5723
5724 fill = width - self->length;
5725
5726 u = pad(self, fill, 0, '0');
5727
Walter Dörwald068325e2002-04-15 13:36:47 +00005728 if (u == NULL)
5729 return NULL;
5730
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 if (u->str[fill] == '+' || u->str[fill] == '-') {
5732 /* move sign to beginning of string */
5733 u->str[0] = u->str[fill];
5734 u->str[fill] = '0';
5735 }
5736
5737 return (PyObject*) u;
5738}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739
#if 0
/* Compiled out: returns unicode_freelist_size as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
5747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005748PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005749"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005751Return True if S starts with the specified prefix, False otherwise.\n\
5752With optional start, test S beginning at that position.\n\
5753With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
5755static PyObject *
5756unicode_startswith(PyUnicodeObject *self,
5757 PyObject *args)
5758{
5759 PyUnicodeObject *substring;
5760 int start = 0;
5761 int end = INT_MAX;
5762 PyObject *result;
5763
Guido van Rossumb8872e62000-05-09 14:14:27 +00005764 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5765 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 return NULL;
5767 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5768 (PyObject *)substring);
5769 if (substring == NULL)
5770 return NULL;
5771
Guido van Rossum77f6a652002-04-03 22:41:51 +00005772 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773
5774 Py_DECREF(substring);
5775 return result;
5776}
5777
5778
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005779PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005780"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005782Return True if S ends with the specified suffix, False otherwise.\n\
5783With optional start, test S beginning at that position.\n\
5784With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785
5786static PyObject *
5787unicode_endswith(PyUnicodeObject *self,
5788 PyObject *args)
5789{
5790 PyUnicodeObject *substring;
5791 int start = 0;
5792 int end = INT_MAX;
5793 PyObject *result;
5794
Guido van Rossumb8872e62000-05-09 14:14:27 +00005795 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5796 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 return NULL;
5798 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5799 (PyObject *)substring);
5800 if (substring == NULL)
5801 return NULL;
5802
Guido van Rossum77f6a652002-04-03 22:41:51 +00005803 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804
5805 Py_DECREF(substring);
5806 return result;
5807}
5808
5809
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005810
5811static PyObject *
5812unicode_getnewargs(PyUnicodeObject *v)
5813{
5814 return Py_BuildValue("(u#)", v->str, v->length);
5815}
5816
5817
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818static PyMethodDef unicode_methods[] = {
5819
5820 /* Order is according to common usage: often used methods should
5821 appear first, since lookup is done sequentially. */
5822
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005823 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5824 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5825 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5826 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5827 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5828 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5829 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5830 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5831 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5832 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5833 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5834 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5835 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005836 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005837/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5838 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5839 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5840 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005841 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005842 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005843 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005844 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5845 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5846 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5847 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5848 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5849 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5850 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5851 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5852 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5853 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5854 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5855 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5856 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5857 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005858 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005859#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005860 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861#endif
5862
5863#if 0
5864 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005865 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866#endif
5867
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005868 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 {NULL, NULL}
5870};
5871
Neil Schemenauerce30bc92002-11-18 16:10:18 +00005872static PyObject *
5873unicode_mod(PyObject *v, PyObject *w)
5874{
5875 if (!PyUnicode_Check(v)) {
5876 Py_INCREF(Py_NotImplemented);
5877 return Py_NotImplemented;
5878 }
5879 return PyUnicode_Format(v, w);
5880}
5881
5882static PyNumberMethods unicode_as_number = {
5883 0, /*nb_add*/
5884 0, /*nb_subtract*/
5885 0, /*nb_multiply*/
5886 0, /*nb_divide*/
5887 unicode_mod, /*nb_remainder*/
5888};
5889
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890static PySequenceMethods unicode_as_sequence = {
5891 (inquiry) unicode_length, /* sq_length */
5892 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5893 (intargfunc) unicode_repeat, /* sq_repeat */
5894 (intargfunc) unicode_getitem, /* sq_item */
5895 (intintargfunc) unicode_slice, /* sq_slice */
5896 0, /* sq_ass_item */
5897 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005898 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899};
5900
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005901static PyObject*
5902unicode_subscript(PyUnicodeObject* self, PyObject* item)
5903{
5904 if (PyInt_Check(item)) {
5905 long i = PyInt_AS_LONG(item);
5906 if (i < 0)
5907 i += PyString_GET_SIZE(self);
5908 return unicode_getitem(self, i);
5909 } else if (PyLong_Check(item)) {
5910 long i = PyLong_AsLong(item);
5911 if (i == -1 && PyErr_Occurred())
5912 return NULL;
5913 if (i < 0)
5914 i += PyString_GET_SIZE(self);
5915 return unicode_getitem(self, i);
5916 } else if (PySlice_Check(item)) {
5917 int start, stop, step, slicelength, cur, i;
5918 Py_UNICODE* source_buf;
5919 Py_UNICODE* result_buf;
5920 PyObject* result;
5921
5922 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5923 &start, &stop, &step, &slicelength) < 0) {
5924 return NULL;
5925 }
5926
5927 if (slicelength <= 0) {
5928 return PyUnicode_FromUnicode(NULL, 0);
5929 } else {
5930 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5931 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5932
5933 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5934 result_buf[i] = source_buf[cur];
5935 }
5936
5937 result = PyUnicode_FromUnicode(result_buf, slicelength);
5938 PyMem_FREE(result_buf);
5939 return result;
5940 }
5941 } else {
5942 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5943 return NULL;
5944 }
5945}
5946
5947static PyMappingMethods unicode_as_mapping = {
5948 (inquiry)unicode_length, /* mp_length */
5949 (binaryfunc)unicode_subscript, /* mp_subscript */
5950 (objobjargproc)0, /* mp_ass_subscript */
5951};
5952
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953static int
5954unicode_buffer_getreadbuf(PyUnicodeObject *self,
5955 int index,
5956 const void **ptr)
5957{
5958 if (index != 0) {
5959 PyErr_SetString(PyExc_SystemError,
5960 "accessing non-existent unicode segment");
5961 return -1;
5962 }
5963 *ptr = (void *) self->str;
5964 return PyUnicode_GET_DATA_SIZE(self);
5965}
5966
5967static int
5968unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5969 const void **ptr)
5970{
5971 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005972 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 return -1;
5974}
5975
5976static int
5977unicode_buffer_getsegcount(PyUnicodeObject *self,
5978 int *lenp)
5979{
5980 if (lenp)
5981 *lenp = PyUnicode_GET_DATA_SIZE(self);
5982 return 1;
5983}
5984
5985static int
5986unicode_buffer_getcharbuf(PyUnicodeObject *self,
5987 int index,
5988 const void **ptr)
5989{
5990 PyObject *str;
5991
5992 if (index != 0) {
5993 PyErr_SetString(PyExc_SystemError,
5994 "accessing non-existent unicode segment");
5995 return -1;
5996 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005997 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 if (str == NULL)
5999 return -1;
6000 *ptr = (void *) PyString_AS_STRING(str);
6001 return PyString_GET_SIZE(str);
6002}
6003
6004/* Helpers for PyUnicode_Format() */
6005
6006static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006007getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008{
6009 int argidx = *p_argidx;
6010 if (argidx < arglen) {
6011 (*p_argidx)++;
6012 if (arglen < 0)
6013 return args;
6014 else
6015 return PyTuple_GetItem(args, argidx);
6016 }
6017 PyErr_SetString(PyExc_TypeError,
6018 "not enough arguments for format string");
6019 return NULL;
6020}
6021
/* Formatting flag bits; each occupies its own bit so they can be
   combined and tested with bitwise operators. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6027
6028static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030{
6031 register int i;
6032 int len;
6033 va_list va;
6034 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036
6037 /* First, format the string as char array, then expand to Py_UNICODE
6038 array. */
6039 charbuffer = (char *)buffer;
6040 len = vsprintf(charbuffer, format, va);
6041 for (i = len - 1; i >= 0; i--)
6042 buffer[i] = (Py_UNICODE) charbuffer[i];
6043
6044 va_end(va);
6045 return len;
6046}
6047
Guido van Rossum078151d2002-08-11 04:24:12 +00006048/* XXX To save some code duplication, formatfloat/long/int could have been
6049 shared with stringobject.c, converting from 8-bit to Unicode after the
6050 formatting is done. */
6051
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052static int
6053formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006054 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 int flags,
6056 int prec,
6057 int type,
6058 PyObject *v)
6059{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006060 /* fmt = '%#.' + `prec` + `type`
6061 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 char fmt[20];
6063 double x;
6064
6065 x = PyFloat_AsDouble(v);
6066 if (x == -1.0 && PyErr_Occurred())
6067 return -1;
6068 if (prec < 0)
6069 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6071 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006072 /* Worst case length calc to ensure no buffer overrun:
6073
6074 'g' formats:
6075 fmt = %#.<prec>g
6076 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6077 for any double rep.)
6078 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6079
6080 'f' formats:
6081 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6082 len = 1 + 50 + 1 + prec = 52 + prec
6083
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006084 If prec=0 the effective precision is 1 (the leading digit is
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006085 always given), therefore increase the length by one.
6086
6087 */
6088 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6089 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006090 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006091 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006092 return -1;
6093 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006094 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6095 (flags&F_ALT) ? "#" : "",
6096 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 return usprintf(buf, fmt, x);
6098}
6099
Tim Peters38fd5b62000-09-21 05:43:11 +00006100static PyObject*
6101formatlong(PyObject *val, int flags, int prec, int type)
6102{
6103 char *buf;
6104 int i, len;
6105 PyObject *str; /* temporary string object. */
6106 PyUnicodeObject *result;
6107
6108 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6109 if (!str)
6110 return NULL;
6111 result = _PyUnicode_New(len);
6112 for (i = 0; i < len; i++)
6113 result->str[i] = buf[i];
6114 result->str[len] = 0;
6115 Py_DECREF(str);
6116 return (PyObject*)result;
6117}
6118
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119static int
6120formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006121 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 int flags,
6123 int prec,
6124 int type,
6125 PyObject *v)
6126{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006127 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006128 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6129 * + 1 + 1
6130 * = 24
6131 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006132 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 long x;
6134
6135 x = PyInt_AsLong(v);
6136 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006137 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00006138 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00006139 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00006140 "%u/%o/%x/%X of negative int will return "
6141 "a signed string in Python 2.4 and up") < 0)
6142 return -1;
6143 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006145 prec = 1;
6146
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006147 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006148 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6149 */
6150 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006151 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006152 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006153 return -1;
6154 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006155
6156 if ((flags & F_ALT) &&
6157 (type == 'x' || type == 'X')) {
6158 /* When converting under %#x or %#X, there are a number
6159 * of issues that cause pain:
6160 * - when 0 is being converted, the C standard leaves off
6161 * the '0x' or '0X', which is inconsistent with other
6162 * %#x/%#X conversions and inconsistent with Python's
6163 * hex() function
6164 * - there are platforms that violate the standard and
6165 * convert 0 with the '0x' or '0X'
6166 * (Metrowerks, Compaq Tru64)
6167 * - there are platforms that give '0x' when converting
6168 * under %#X, but convert 0 in accordance with the
6169 * standard (OS/2 EMX)
6170 *
6171 * We can achieve the desired consistency by inserting our
6172 * own '0x' or '0X' prefix, and substituting %x/%X in place
6173 * of %#x/%#X.
6174 *
6175 * Note that this is the same approach as used in
6176 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006177 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006178 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6179 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006180 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006181 else {
6182 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6183 (flags&F_ALT) ? "#" : "",
6184 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 return usprintf(buf, fmt, x);
6187}
6188
6189static int
6190formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006191 size_t buflen,
6192 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006194 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006195 if (PyUnicode_Check(v)) {
6196 if (PyUnicode_GET_SIZE(v) != 1)
6197 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006201 else if (PyString_Check(v)) {
6202 if (PyString_GET_SIZE(v) != 1)
6203 goto onError;
6204 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206
6207 else {
6208 /* Integer input truncated to a character */
6209 long x;
6210 x = PyInt_AsLong(v);
6211 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006212 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006213#ifdef Py_UNICODE_WIDE
6214 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006215 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006216 "%c arg not in range(0x110000) "
6217 "(wide Python build)");
6218 return -1;
6219 }
6220#else
6221 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006222 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006223 "%c arg not in range(0x10000) "
6224 "(narrow Python build)");
6225 return -1;
6226 }
6227#endif
6228 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 }
6230 buf[1] = '\0';
6231 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006232
6233 onError:
6234 PyErr_SetString(PyExc_TypeError,
6235 "%c requires int or char");
6236 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237}
6238
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006239/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6240
6241 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6242 chars are formatted. XXX This is a magic number. Each formatting
6243 routine does bounds checking to ensure no overflow, but a better
6244 solution may be to malloc a buffer of appropriate size for each
6245 format. For now, the current solution is sufficient.
6246*/
6247#define FORMATBUFLEN (size_t)120
6248
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249PyObject *PyUnicode_Format(PyObject *format,
6250 PyObject *args)
6251{
6252 Py_UNICODE *fmt, *res;
6253 int fmtcnt, rescnt, reslen, arglen, argidx;
6254 int args_owned = 0;
6255 PyUnicodeObject *result = NULL;
6256 PyObject *dict = NULL;
6257 PyObject *uformat;
6258
6259 if (format == NULL || args == NULL) {
6260 PyErr_BadInternalCall();
6261 return NULL;
6262 }
6263 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006264 if (uformat == NULL)
6265 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266 fmt = PyUnicode_AS_UNICODE(uformat);
6267 fmtcnt = PyUnicode_GET_SIZE(uformat);
6268
6269 reslen = rescnt = fmtcnt + 100;
6270 result = _PyUnicode_New(reslen);
6271 if (result == NULL)
6272 goto onError;
6273 res = PyUnicode_AS_UNICODE(result);
6274
6275 if (PyTuple_Check(args)) {
6276 arglen = PyTuple_Size(args);
6277 argidx = 0;
6278 }
6279 else {
6280 arglen = -1;
6281 argidx = -2;
6282 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006283 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6284 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 dict = args;
6286
6287 while (--fmtcnt >= 0) {
6288 if (*fmt != '%') {
6289 if (--rescnt < 0) {
6290 rescnt = fmtcnt + 100;
6291 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006292 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293 return NULL;
6294 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6295 --rescnt;
6296 }
6297 *res++ = *fmt++;
6298 }
6299 else {
6300 /* Got a format specifier */
6301 int flags = 0;
6302 int width = -1;
6303 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 Py_UNICODE c = '\0';
6305 Py_UNICODE fill;
6306 PyObject *v = NULL;
6307 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006308 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 Py_UNICODE sign;
6310 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006311 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312
6313 fmt++;
6314 if (*fmt == '(') {
6315 Py_UNICODE *keystart;
6316 int keylen;
6317 PyObject *key;
6318 int pcount = 1;
6319
6320 if (dict == NULL) {
6321 PyErr_SetString(PyExc_TypeError,
6322 "format requires a mapping");
6323 goto onError;
6324 }
6325 ++fmt;
6326 --fmtcnt;
6327 keystart = fmt;
6328 /* Skip over balanced parentheses */
6329 while (pcount > 0 && --fmtcnt >= 0) {
6330 if (*fmt == ')')
6331 --pcount;
6332 else if (*fmt == '(')
6333 ++pcount;
6334 fmt++;
6335 }
6336 keylen = fmt - keystart - 1;
6337 if (fmtcnt < 0 || pcount > 0) {
6338 PyErr_SetString(PyExc_ValueError,
6339 "incomplete format key");
6340 goto onError;
6341 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006342#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006343 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 then looked up since Python uses strings to hold
6345 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006346 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 key = PyUnicode_EncodeUTF8(keystart,
6348 keylen,
6349 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006350#else
6351 key = PyUnicode_FromUnicode(keystart, keylen);
6352#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 if (key == NULL)
6354 goto onError;
6355 if (args_owned) {
6356 Py_DECREF(args);
6357 args_owned = 0;
6358 }
6359 args = PyObject_GetItem(dict, key);
6360 Py_DECREF(key);
6361 if (args == NULL) {
6362 goto onError;
6363 }
6364 args_owned = 1;
6365 arglen = -1;
6366 argidx = -2;
6367 }
6368 while (--fmtcnt >= 0) {
6369 switch (c = *fmt++) {
6370 case '-': flags |= F_LJUST; continue;
6371 case '+': flags |= F_SIGN; continue;
6372 case ' ': flags |= F_BLANK; continue;
6373 case '#': flags |= F_ALT; continue;
6374 case '0': flags |= F_ZERO; continue;
6375 }
6376 break;
6377 }
6378 if (c == '*') {
6379 v = getnextarg(args, arglen, &argidx);
6380 if (v == NULL)
6381 goto onError;
6382 if (!PyInt_Check(v)) {
6383 PyErr_SetString(PyExc_TypeError,
6384 "* wants int");
6385 goto onError;
6386 }
6387 width = PyInt_AsLong(v);
6388 if (width < 0) {
6389 flags |= F_LJUST;
6390 width = -width;
6391 }
6392 if (--fmtcnt >= 0)
6393 c = *fmt++;
6394 }
6395 else if (c >= '0' && c <= '9') {
6396 width = c - '0';
6397 while (--fmtcnt >= 0) {
6398 c = *fmt++;
6399 if (c < '0' || c > '9')
6400 break;
6401 if ((width*10) / 10 != width) {
6402 PyErr_SetString(PyExc_ValueError,
6403 "width too big");
6404 goto onError;
6405 }
6406 width = width*10 + (c - '0');
6407 }
6408 }
6409 if (c == '.') {
6410 prec = 0;
6411 if (--fmtcnt >= 0)
6412 c = *fmt++;
6413 if (c == '*') {
6414 v = getnextarg(args, arglen, &argidx);
6415 if (v == NULL)
6416 goto onError;
6417 if (!PyInt_Check(v)) {
6418 PyErr_SetString(PyExc_TypeError,
6419 "* wants int");
6420 goto onError;
6421 }
6422 prec = PyInt_AsLong(v);
6423 if (prec < 0)
6424 prec = 0;
6425 if (--fmtcnt >= 0)
6426 c = *fmt++;
6427 }
6428 else if (c >= '0' && c <= '9') {
6429 prec = c - '0';
6430 while (--fmtcnt >= 0) {
6431 c = Py_CHARMASK(*fmt++);
6432 if (c < '0' || c > '9')
6433 break;
6434 if ((prec*10) / 10 != prec) {
6435 PyErr_SetString(PyExc_ValueError,
6436 "prec too big");
6437 goto onError;
6438 }
6439 prec = prec*10 + (c - '0');
6440 }
6441 }
6442 } /* prec */
6443 if (fmtcnt >= 0) {
6444 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 if (--fmtcnt >= 0)
6446 c = *fmt++;
6447 }
6448 }
6449 if (fmtcnt < 0) {
6450 PyErr_SetString(PyExc_ValueError,
6451 "incomplete format");
6452 goto onError;
6453 }
6454 if (c != '%') {
6455 v = getnextarg(args, arglen, &argidx);
6456 if (v == NULL)
6457 goto onError;
6458 }
6459 sign = 0;
6460 fill = ' ';
6461 switch (c) {
6462
6463 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006464 pbuf = formatbuf;
6465 /* presume that buffer length is at least 1 */
6466 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 len = 1;
6468 break;
6469
6470 case 's':
6471 case 'r':
6472 if (PyUnicode_Check(v) && c == 's') {
6473 temp = v;
6474 Py_INCREF(temp);
6475 }
6476 else {
6477 PyObject *unicode;
6478 if (c == 's')
6479 temp = PyObject_Str(v);
6480 else
6481 temp = PyObject_Repr(v);
6482 if (temp == NULL)
6483 goto onError;
6484 if (!PyString_Check(temp)) {
6485 /* XXX Note: this should never happen, since
6486 PyObject_Repr() and PyObject_Str() assure
6487 this */
6488 Py_DECREF(temp);
6489 PyErr_SetString(PyExc_TypeError,
6490 "%s argument has non-string str()");
6491 goto onError;
6492 }
Fred Drakee4315f52000-05-09 19:53:39 +00006493 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006495 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 "strict");
6497 Py_DECREF(temp);
6498 temp = unicode;
6499 if (temp == NULL)
6500 goto onError;
6501 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006502 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 len = PyUnicode_GET_SIZE(temp);
6504 if (prec >= 0 && len > prec)
6505 len = prec;
6506 break;
6507
6508 case 'i':
6509 case 'd':
6510 case 'u':
6511 case 'o':
6512 case 'x':
6513 case 'X':
6514 if (c == 'i')
6515 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006516 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006517 temp = formatlong(v, flags, prec, c);
6518 if (!temp)
6519 goto onError;
6520 pbuf = PyUnicode_AS_UNICODE(temp);
6521 len = PyUnicode_GET_SIZE(temp);
6522 /* unbounded ints can always produce
6523 a sign character! */
6524 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006526 else {
6527 pbuf = formatbuf;
6528 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6529 flags, prec, c, v);
6530 if (len < 0)
6531 goto onError;
6532 /* only d conversion is signed */
6533 sign = c == 'd';
6534 }
6535 if (flags & F_ZERO)
6536 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 break;
6538
6539 case 'e':
6540 case 'E':
6541 case 'f':
6542 case 'g':
6543 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006544 pbuf = formatbuf;
6545 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6546 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 if (len < 0)
6548 goto onError;
6549 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006550 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 fill = '0';
6552 break;
6553
6554 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006555 pbuf = formatbuf;
6556 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 if (len < 0)
6558 goto onError;
6559 break;
6560
6561 default:
6562 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006563 "unsupported format character '%c' (0x%x) "
6564 "at index %i",
Neal Norwitza0378e12002-09-13 13:47:06 +00006565 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006566 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006567 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 goto onError;
6569 }
6570 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006571 if (*pbuf == '-' || *pbuf == '+') {
6572 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 len--;
6574 }
6575 else if (flags & F_SIGN)
6576 sign = '+';
6577 else if (flags & F_BLANK)
6578 sign = ' ';
6579 else
6580 sign = 0;
6581 }
6582 if (width < len)
6583 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006584 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 reslen -= rescnt;
6586 rescnt = width + fmtcnt + 100;
6587 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006588 if (reslen < 0) {
6589 Py_DECREF(result);
6590 return PyErr_NoMemory();
6591 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006592 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 return NULL;
6594 res = PyUnicode_AS_UNICODE(result)
6595 + reslen - rescnt;
6596 }
6597 if (sign) {
6598 if (fill != ' ')
6599 *res++ = sign;
6600 rescnt--;
6601 if (width > len)
6602 width--;
6603 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006604 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6605 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006606 assert(pbuf[1] == c);
6607 if (fill != ' ') {
6608 *res++ = *pbuf++;
6609 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006610 }
Tim Petersfff53252001-04-12 18:38:48 +00006611 rescnt -= 2;
6612 width -= 2;
6613 if (width < 0)
6614 width = 0;
6615 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 if (width > len && !(flags & F_LJUST)) {
6618 do {
6619 --rescnt;
6620 *res++ = fill;
6621 } while (--width > len);
6622 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006623 if (fill == ' ') {
6624 if (sign)
6625 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006626 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006627 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006628 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006629 *res++ = *pbuf++;
6630 *res++ = *pbuf++;
6631 }
6632 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006633 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 res += len;
6635 rescnt -= len;
6636 while (--width >= len) {
6637 --rescnt;
6638 *res++ = ' ';
6639 }
6640 if (dict && (argidx < arglen) && c != '%') {
6641 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006642 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 goto onError;
6644 }
6645 Py_XDECREF(temp);
6646 } /* '%' */
6647 } /* until end */
6648 if (argidx < arglen && !dict) {
6649 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006650 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 goto onError;
6652 }
6653
6654 if (args_owned) {
6655 Py_DECREF(args);
6656 }
6657 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006658 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006659 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 return (PyObject *)result;
6661
6662 onError:
6663 Py_XDECREF(result);
6664 Py_DECREF(uformat);
6665 if (args_owned) {
6666 Py_DECREF(args);
6667 }
6668 return NULL;
6669}
6670
/* Buffer interface slots of the unicode type, in PyBufferProcs order:
   read buffer, write buffer, segment count, character buffer.
   PyUnicode_Type refers to this table through its tp_as_buffer slot. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
6677
Jeremy Hylton938ace62002-07-17 16:30:39 +00006678static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006679unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6680
Tim Peters6d6c1a32001-08-02 04:15:00 +00006681static PyObject *
6682unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6683{
6684 PyObject *x = NULL;
6685 static char *kwlist[] = {"string", "encoding", "errors", 0};
6686 char *encoding = NULL;
6687 char *errors = NULL;
6688
Guido van Rossume023fe02001-08-30 03:12:59 +00006689 if (type != &PyUnicode_Type)
6690 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006691 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6692 kwlist, &x, &encoding, &errors))
6693 return NULL;
6694 if (x == NULL)
6695 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006696 if (encoding == NULL && errors == NULL)
6697 return PyObject_Unicode(x);
6698 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006699 return PyUnicode_FromEncodedObject(x, encoding, errors);
6700}
6701
Guido van Rossume023fe02001-08-30 03:12:59 +00006702static PyObject *
6703unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6704{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006705 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006706 int n;
6707
6708 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6709 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6710 if (tmp == NULL)
6711 return NULL;
6712 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006713 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006714 if (pnew == NULL) {
6715 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00006716 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00006717 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006718 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6719 if (pnew->str == NULL) {
6720 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006721 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006722 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00006723 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00006724 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006725 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6726 pnew->length = n;
6727 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006728 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006729 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006730}
6731
/* Docstring of the unicode type; PyUnicode_Type refers to it through
   its tp_doc slot. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006738
/* Type object of the unicode type.

   The initializer is positional, so the order of the entries must stay
   in step with the layout of PyTypeObject.  Slots given as 0 are left
   unset here.  The type derives from PyBaseString_Type, may itself be
   subclassed (Py_TPFLAGS_BASETYPE), and creates its instances through
   unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE,                /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
6783
6784/* Initialize the Unicode implementation */
6785
Thomas Wouters78890102000-07-22 19:25:51 +00006786void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006788 int i;
6789
Fred Drakee4315f52000-05-09 19:53:39 +00006790 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006791 unicode_freelist = NULL;
6792 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00006794 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006795 for (i = 0; i < 256; i++)
6796 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006797 if (PyType_Ready(&PyUnicode_Type) < 0)
6798 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799}
6800
6801/* Finalize the Unicode implementation */
6802
6803void
Thomas Wouters78890102000-07-22 19:25:51 +00006804_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006806 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006807 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006809 Py_XDECREF(unicode_empty);
6810 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006811
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006812 for (i = 0; i < 256; i++) {
6813 if (unicode_latin1[i]) {
6814 Py_DECREF(unicode_latin1[i]);
6815 unicode_latin1[i] = NULL;
6816 }
6817 }
6818
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006819 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 PyUnicodeObject *v = u;
6821 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006822 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006823 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006824 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006825 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006827 unicode_freelist = NULL;
6828 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006830
6831/*
6832Local variables:
6833c-basic-offset: 4
6834indent-tabs-mode: nil
6835End:
6836*/