blob: b165597273550b4e3e4a89c9a7428a4cce57bc3d [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list. */
#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects parked on the free list.

   A free-listed object keeps its allocated character buffer as long
   as its length is below this limit, which reduces malloc() overhead
   for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.
*/
#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */
#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000279 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000280 *unicode = (PyObject *)w;
281 return 0;
282 }
283
284 /* Note that we don't have to modify *unicode for unshared Unicode
285 objects, since we can modify them in-place. */
286 return unicode_resize(v, length);
287}
288
289/* Internal API for use in unicodeobject.c only ! */
290#define _PyUnicode_Resize(unicodevar, length) \
291 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
294 int size)
295{
296 PyUnicodeObject *unicode;
297
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000298 /* If the Unicode data is known at construction time, we can apply
299 some optimizations which share commonly used objects. */
300 if (u != NULL) {
301
302 /* Optimization for empty strings */
303 if (size == 0 && unicode_empty != NULL) {
304 Py_INCREF(unicode_empty);
305 return (PyObject *)unicode_empty;
306 }
307
308 /* Single character Unicode objects in the Latin-1 range are
309 shared when using this constructor */
310 if (size == 1 && *u < 256) {
311 unicode = unicode_latin1[*u];
312 if (!unicode) {
313 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000314 if (!unicode)
315 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000316 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000317 unicode_latin1[*u] = unicode;
318 }
319 Py_INCREF(unicode);
320 return (PyObject *)unicode;
321 }
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 unicode = _PyUnicode_New(size);
325 if (!unicode)
326 return NULL;
327
328 /* Copy the Unicode data into the new object */
329 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331
332 return (PyObject *)unicode;
333}
334
335#ifdef HAVE_WCHAR_H
336
337PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
338 int size)
339{
340 PyUnicodeObject *unicode;
341
342 if (w == NULL) {
343 PyErr_BadInternalCall();
344 return NULL;
345 }
346
347 unicode = _PyUnicode_New(size);
348 if (!unicode)
349 return NULL;
350
351 /* Copy the wchar_t data into the new object */
352#ifdef HAVE_USABLE_WCHAR_T
353 memcpy(unicode->str, w, size * sizeof(wchar_t));
354#else
355 {
356 register Py_UNICODE *u;
357 register int i;
358 u = PyUnicode_AS_UNICODE(unicode);
359 for (i = size; i >= 0; i--)
360 *u++ = *w++;
361 }
362#endif
363
364 return (PyObject *)unicode;
365}
366
367int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
368 register wchar_t *w,
369 int size)
370{
371 if (unicode == NULL) {
372 PyErr_BadInternalCall();
373 return -1;
374 }
375 if (size > PyUnicode_GET_SIZE(unicode))
376 size = PyUnicode_GET_SIZE(unicode);
377#ifdef HAVE_USABLE_WCHAR_T
378 memcpy(w, unicode->str, size * sizeof(wchar_t));
379#else
380 {
381 register Py_UNICODE *u;
382 register int i;
383 u = PyUnicode_AS_UNICODE(unicode);
384 for (i = size; i >= 0; i--)
385 *w++ = *u++;
386 }
387#endif
388
389 return size;
390}
391
392#endif
393
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000394PyObject *PyUnicode_FromOrdinal(int ordinal)
395{
396 Py_UNICODE s[2];
397
398#ifdef Py_UNICODE_WIDE
399 if (ordinal < 0 || ordinal > 0x10ffff) {
400 PyErr_SetString(PyExc_ValueError,
401 "unichr() arg not in range(0x110000) "
402 "(wide Python build)");
403 return NULL;
404 }
405#else
406 if (ordinal < 0 || ordinal > 0xffff) {
407 PyErr_SetString(PyExc_ValueError,
408 "unichr() arg not in range(0x10000) "
409 "(narrow Python build)");
410 return NULL;
411 }
412#endif
413
414 if (ordinal <= 0xffff) {
415 /* UCS-2 character */
416 s[0] = (Py_UNICODE) ordinal;
417 return PyUnicode_FromUnicode(s, 1);
418 }
419 else {
420#ifndef Py_UNICODE_WIDE
421 /* UCS-4 character. store as two surrogate characters */
422 ordinal -= 0x10000L;
423 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
424 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
425 return PyUnicode_FromUnicode(s, 2);
426#else
427 s[0] = (Py_UNICODE)ordinal;
428 return PyUnicode_FromUnicode(s, 1);
429#endif
430 }
431}
432
Guido van Rossumd57fd912000-03-10 22:53:23 +0000433PyObject *PyUnicode_FromObject(register PyObject *obj)
434{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000435 /* XXX Perhaps we should make this API an alias of
436 PyObject_Unicode() instead ?! */
437 if (PyUnicode_CheckExact(obj)) {
438 Py_INCREF(obj);
439 return obj;
440 }
441 if (PyUnicode_Check(obj)) {
442 /* For a Unicode subtype that's not a Unicode object,
443 return a true Unicode object with the same data. */
444 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
445 PyUnicode_GET_SIZE(obj));
446 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
448}
449
450PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
451 const char *encoding,
452 const char *errors)
453{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000456 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457
458 if (obj == NULL) {
459 PyErr_BadInternalCall();
460 return NULL;
461 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000462
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000463#if 0
464 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000465 that no encodings is given and then redirect to
466 PyObject_Unicode() which then applies the additional logic for
467 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000468
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000469 NOTE: This API should really only be used for object which
470 represent *encoded* Unicode !
471
472 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000473 if (PyUnicode_Check(obj)) {
474 if (encoding) {
475 PyErr_SetString(PyExc_TypeError,
476 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000477 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000478 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000479 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481#else
482 if (PyUnicode_Check(obj)) {
483 PyErr_SetString(PyExc_TypeError,
484 "decoding Unicode is not supported");
485 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000486 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487#endif
488
489 /* Coerce object */
490 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000491 s = PyString_AS_STRING(obj);
492 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000493 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000494 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
495 /* Overwrite the error message with something more useful in
496 case of a TypeError. */
497 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499 "coercing to Unicode: need string or buffer, "
500 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 obj->ob_type->tp_name);
502 goto onError;
503 }
504
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 if (len == 0) {
507 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 else
511 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000512
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 return v;
514
515 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517}
518
519PyObject *PyUnicode_Decode(const char *s,
520 int size,
521 const char *encoding,
522 const char *errors)
523{
524 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000525
526 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000527 encoding = PyUnicode_GetDefaultEncoding();
528
529 /* Shortcuts for common default encodings */
530 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000532 else if (strcmp(encoding, "latin-1") == 0)
533 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000534#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
535 else if (strcmp(encoding, "mbcs") == 0)
536 return PyUnicode_DecodeMBCS(s, size, errors);
537#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000538 else if (strcmp(encoding, "ascii") == 0)
539 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000540
541 /* Decode via the codec registry */
542 buffer = PyBuffer_FromMemory((void *)s, size);
543 if (buffer == NULL)
544 goto onError;
545 unicode = PyCodec_Decode(buffer, encoding, errors);
546 if (unicode == NULL)
547 goto onError;
548 if (!PyUnicode_Check(unicode)) {
549 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000550 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000551 unicode->ob_type->tp_name);
552 Py_DECREF(unicode);
553 goto onError;
554 }
555 Py_DECREF(buffer);
556 return unicode;
557
558 onError:
559 Py_XDECREF(buffer);
560 return NULL;
561}
562
563PyObject *PyUnicode_Encode(const Py_UNICODE *s,
564 int size,
565 const char *encoding,
566 const char *errors)
567{
568 PyObject *v, *unicode;
569
570 unicode = PyUnicode_FromUnicode(s, size);
571 if (unicode == NULL)
572 return NULL;
573 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
574 Py_DECREF(unicode);
575 return v;
576}
577
578PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
579 const char *encoding,
580 const char *errors)
581{
582 PyObject *v;
583
584 if (!PyUnicode_Check(unicode)) {
585 PyErr_BadArgument();
586 goto onError;
587 }
Fred Drakee4315f52000-05-09 19:53:39 +0000588
589 if (encoding == NULL)
590 encoding = PyUnicode_GetDefaultEncoding();
591
592 /* Shortcuts for common default encodings */
593 if (errors == NULL) {
594 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000595 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000596 else if (strcmp(encoding, "latin-1") == 0)
597 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000598#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
599 else if (strcmp(encoding, "mbcs") == 0)
600 return PyUnicode_AsMBCSString(unicode);
601#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000602 else if (strcmp(encoding, "ascii") == 0)
603 return PyUnicode_AsASCIIString(unicode);
604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605
606 /* Encode via the codec registry */
607 v = PyCodec_Encode(unicode, encoding, errors);
608 if (v == NULL)
609 goto onError;
610 /* XXX Should we really enforce this ? */
611 if (!PyString_Check(v)) {
612 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000613 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 v->ob_type->tp_name);
615 Py_DECREF(v);
616 goto onError;
617 }
618 return v;
619
620 onError:
621 return NULL;
622}
623
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000624PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
625 const char *errors)
626{
627 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
628
629 if (v)
630 return v;
631 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
632 if (v && errors == NULL)
633 ((PyUnicodeObject *)unicode)->defenc = v;
634 return v;
635}
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
638{
639 if (!PyUnicode_Check(unicode)) {
640 PyErr_BadArgument();
641 goto onError;
642 }
643 return PyUnicode_AS_UNICODE(unicode);
644
645 onError:
646 return NULL;
647}
648
649int PyUnicode_GetSize(PyObject *unicode)
650{
651 if (!PyUnicode_Check(unicode)) {
652 PyErr_BadArgument();
653 goto onError;
654 }
655 return PyUnicode_GET_SIZE(unicode);
656
657 onError:
658 return -1;
659}
660
Thomas Wouters78890102000-07-22 19:25:51 +0000661const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000662{
663 return unicode_default_encoding;
664}
665
666int PyUnicode_SetDefaultEncoding(const char *encoding)
667{
668 PyObject *v;
669
670 /* Make sure the encoding is valid. As side effect, this also
671 loads the encoding into the codec registry cache. */
672 v = _PyCodec_Lookup(encoding);
673 if (v == NULL)
674 goto onError;
675 Py_DECREF(v);
676 strncpy(unicode_default_encoding,
677 encoding,
678 sizeof(unicode_default_encoding));
679 return 0;
680
681 onError:
682 return -1;
683}
684
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000685/* error handling callback helper:
686 build arguments, call the callback and check the arguments,
687 if no exception occured, copy the replacement to the output
688 and adjust various state variables.
689 return 0 on success, -1 on error
690*/
691
692static
693int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
694 const char *encoding, const char *reason,
695 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
696 PyObject **output, int *outpos, Py_UNICODE **outptr)
697{
698 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
699
700 PyObject *restuple = NULL;
701 PyObject *repunicode = NULL;
702 int outsize = PyUnicode_GET_SIZE(*output);
703 int requiredsize;
704 int newpos;
705 Py_UNICODE *repptr;
706 int repsize;
707 int res = -1;
708
709 if (*errorHandler == NULL) {
710 *errorHandler = PyCodec_LookupError(errors);
711 if (*errorHandler == NULL)
712 goto onError;
713 }
714
715 if (*exceptionObject == NULL) {
716 *exceptionObject = PyUnicodeDecodeError_Create(
717 encoding, input, insize, *startinpos, *endinpos, reason);
718 if (*exceptionObject == NULL)
719 goto onError;
720 }
721 else {
722 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
723 goto onError;
724 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
725 goto onError;
726 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
727 goto onError;
728 }
729
730 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
731 if (restuple == NULL)
732 goto onError;
733 if (!PyTuple_Check(restuple)) {
734 PyErr_Format(PyExc_TypeError, &argparse[4]);
735 goto onError;
736 }
737 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
738 goto onError;
739 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000740 newpos = insize+newpos;
741 if (newpos<0 || newpos>insize) {
742 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
743 goto onError;
744 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000745
746 /* need more space? (at least enough for what we
747 have+the replacement+the rest of the string (starting
748 at the new input position), so we won't have to check space
749 when there are no errors in the rest of the string) */
750 repptr = PyUnicode_AS_UNICODE(repunicode);
751 repsize = PyUnicode_GET_SIZE(repunicode);
752 requiredsize = *outpos + repsize + insize-newpos;
753 if (requiredsize > outsize) {
754 if (requiredsize<2*outsize)
755 requiredsize = 2*outsize;
756 if (PyUnicode_Resize(output, requiredsize))
757 goto onError;
758 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
759 }
760 *endinpos = newpos;
761 *inptr = input + newpos;
762 Py_UNICODE_COPY(*outptr, repptr, repsize);
763 *outptr += repsize;
764 *outpos += repsize;
765 /* we made it! */
766 res = 0;
767
768 onError:
769 Py_XDECREF(restuple);
770 return res;
771}
772
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000773/* --- UTF-7 Codec -------------------------------------------------------- */
774
775/* see RFC2152 for details */
776
/* Classification of the 7-bit characters for the UTF-7 codec.

   Each entry indicates whether a UTF-7 character is special, i.e.
   cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   The table is read-only lookup data, hence const. */
static
const char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* True when character `c` must not be emitted directly.

   Characters above 127 and class-1 characters are always special;
   whitespace (class 2) is special only when `encodeWS` is true and
   Set O characters (class 3) only when `encodeO` is true.

   `c` is evaluated several times and is expected to be non-negative.
   The flag arguments are parenthesized so that compound expressions
   (e.g. `a || b`) can be passed safely. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))
800
/* Map the low six bits of `n` to the corresponding base64 character. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when `c` is a character of the base64 alphabet.

   isalnum() is only defined for values representable as unsigned
   char (or EOF), so it is consulted for 7-bit values only; anything
   else (wide characters, sign-extended bytes) is never a base64
   character. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* Inverse of B64(): the six-bit value of base64 character `c`.  Only
   meaningful when B64CHAR(c) is true. */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
805
/* Emit base64 characters through `out` for as long as at least six
   bits are pending in `ch`; `bits` is the number of pending bits and
   is reduced by six per character written.  Relies on B64(). */
#define ENCODE(out, ch, bits) \
    for (; bits >= 6; bits -= 6) { \
        *out++ = B64(ch >> (bits-6)); \
    }

/* Extract complete 16-bit units from the bit accumulator `ch` (which
   holds `bits` pending bits) and store them through `out`.

   The expansion uses the variable `errmsg` and the label `utf7Error`,
   both of which must exist in the function using the macro. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE utf7_unit = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high */ \
            /* surrogate, so let's not bother seeing if the low */ \
            /* surrogate is correct or not */ \
            surrogate = 0; \
        } else if (utf7_unit >= 0xDC00 && utf7_unit <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't */ \
            /* represent it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = utf7_unit; \
        } \
    }

830
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000831PyObject *PyUnicode_DecodeUTF7(const char *s,
832 int size,
833 const char *errors)
834{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000835 const char *starts = s;
836 int startinpos;
837 int endinpos;
838 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000839 const char *e;
840 PyUnicodeObject *unicode;
841 Py_UNICODE *p;
842 const char *errmsg = "";
843 int inShift = 0;
844 unsigned int bitsleft = 0;
845 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000846 int surrogate = 0;
847 PyObject *errorHandler = NULL;
848 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000849
850 unicode = _PyUnicode_New(size);
851 if (!unicode)
852 return NULL;
853 if (size == 0)
854 return (PyObject *)unicode;
855
856 p = unicode->str;
857 e = s + size;
858
859 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 Py_UNICODE ch;
861 restart:
862 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000863
864 if (inShift) {
865 if ((ch == '-') || !B64CHAR(ch)) {
866 inShift = 0;
867 s++;
868
869 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
870 if (bitsleft >= 6) {
871 /* The shift sequence has a partial character in it. If
872 bitsleft < 6 then we could just classify it as padding
873 but that is not the case here */
874
875 errmsg = "partial character in shift sequence";
876 goto utf7Error;
877 }
878 /* According to RFC2152 the remaining bits should be zero. We
879 choose to signal an error/insert a replacement character
880 here so indicate the potential of a misencoded character. */
881
882 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
883 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
884 errmsg = "non-zero padding bits in shift sequence";
885 goto utf7Error;
886 }
887
888 if (ch == '-') {
889 if ((s < e) && (*(s) == '-')) {
890 *p++ = '-';
891 inShift = 1;
892 }
893 } else if (SPECIAL(ch,0,0)) {
894 errmsg = "unexpected special character";
895 goto utf7Error;
896 } else {
897 *p++ = ch;
898 }
899 } else {
900 charsleft = (charsleft << 6) | UB64(ch);
901 bitsleft += 6;
902 s++;
903 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
904 }
905 }
906 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000907 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908 s++;
909 if (s < e && *s == '-') {
910 s++;
911 *p++ = '+';
912 } else
913 {
914 inShift = 1;
915 bitsleft = 0;
916 }
917 }
918 else if (SPECIAL(ch,0,0)) {
919 errmsg = "unexpected special character";
920 s++;
921 goto utf7Error;
922 }
923 else {
924 *p++ = ch;
925 s++;
926 }
927 continue;
928 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000929 outpos = p-PyUnicode_AS_UNICODE(unicode);
930 endinpos = s-starts;
931 if (unicode_decode_call_errorhandler(
932 errors, &errorHandler,
933 "utf7", errmsg,
934 starts, size, &startinpos, &endinpos, &exc, &s,
935 (PyObject **)&unicode, &outpos, &p))
936 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000937 }
938
939 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000940 outpos = p-PyUnicode_AS_UNICODE(unicode);
941 endinpos = size;
942 if (unicode_decode_call_errorhandler(
943 errors, &errorHandler,
944 "utf7", "unterminated shift sequence",
945 starts, size, &startinpos, &endinpos, &exc, &s,
946 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000947 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000948 if (s < e)
949 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 }
951
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000952 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000953 goto onError;
954
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000955 Py_XDECREF(errorHandler);
956 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000957 return (PyObject *)unicode;
958
959onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000960 Py_XDECREF(errorHandler);
961 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000962 Py_DECREF(unicode);
963 return NULL;
964}
965
966
967PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
968 int size,
969 int encodeSetO,
970 int encodeWhiteSpace,
971 const char *errors)
972{
973 PyObject *v;
974 /* It might be possible to tighten this worst case */
975 unsigned int cbAllocated = 5 * size;
976 int inShift = 0;
977 int i = 0;
978 unsigned int bitsleft = 0;
979 unsigned long charsleft = 0;
980 char * out;
981 char * start;
982
983 if (size == 0)
984 return PyString_FromStringAndSize(NULL, 0);
985
986 v = PyString_FromStringAndSize(NULL, cbAllocated);
987 if (v == NULL)
988 return NULL;
989
990 start = out = PyString_AS_STRING(v);
991 for (;i < size; ++i) {
992 Py_UNICODE ch = s[i];
993
994 if (!inShift) {
995 if (ch == '+') {
996 *out++ = '+';
997 *out++ = '-';
998 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
999 charsleft = ch;
1000 bitsleft = 16;
1001 *out++ = '+';
1002 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1003 inShift = bitsleft > 0;
1004 } else {
1005 *out++ = (char) ch;
1006 }
1007 } else {
1008 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1009 *out++ = B64(charsleft << (6-bitsleft));
1010 charsleft = 0;
1011 bitsleft = 0;
1012 /* Characters not in the BASE64 set implicitly unshift the sequence
1013 so no '-' is required, except if the character is itself a '-' */
1014 if (B64CHAR(ch) || ch == '-') {
1015 *out++ = '-';
1016 }
1017 inShift = 0;
1018 *out++ = (char) ch;
1019 } else {
1020 bitsleft += 16;
1021 charsleft = (charsleft << 16) | ch;
1022 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1023
1024 /* If the next character is special then we dont' need to terminate
1025 the shift sequence. If the next character is not a BASE64 character
1026 or '-' then the shift sequence will be terminated implicitly and we
1027 don't have to insert a '-'. */
1028
1029 if (bitsleft == 0) {
1030 if (i + 1 < size) {
1031 Py_UNICODE ch2 = s[i+1];
1032
1033 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1034
1035 } else if (B64CHAR(ch2) || ch2 == '-') {
1036 *out++ = '-';
1037 inShift = 0;
1038 } else {
1039 inShift = 0;
1040 }
1041
1042 }
1043 else {
1044 *out++ = '-';
1045 inShift = 0;
1046 }
1047 }
1048 }
1049 }
1050 }
1051 if (bitsleft) {
1052 *out++= B64(charsleft << (6-bitsleft) );
1053 *out++ = '-';
1054 }
1055
Tim Peters5de98422002-04-27 18:44:32 +00001056 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001057 return v;
1058}
1059
/* Remove the helper macros of the UTF-7 codec above so that their names
   are not visible to the rest of the file. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1066
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067/* --- UTF-8 Codec -------------------------------------------------------- */
1068
/* Sequence length indexed by the first byte of a UTF-8 sequence.
   A value of zero marks a byte that cannot start a sequence.
   See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 */
    /* 0x80 - 0xBF: illegal as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 */
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 */
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 */
    /* 0xF0 - 0xFD: four, five and six byte sequences; 0xFE, 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 */
};
1090
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091PyObject *PyUnicode_DecodeUTF8(const char *s,
1092 int size,
1093 const char *errors)
1094{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001095 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001097 int startinpos;
1098 int endinpos;
1099 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 const char *e;
1101 PyUnicodeObject *unicode;
1102 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001103 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001104 PyObject *errorHandler = NULL;
1105 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001106
1107 /* Note: size will always be longer than the resulting Unicode
1108 character count */
1109 unicode = _PyUnicode_New(size);
1110 if (!unicode)
1111 return NULL;
1112 if (size == 0)
1113 return (PyObject *)unicode;
1114
1115 /* Unpack UTF-8 encoded data */
1116 p = unicode->str;
1117 e = s + size;
1118
1119 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001120 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121
1122 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001123 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001124 s++;
1125 continue;
1126 }
1127
1128 n = utf8_code_length[ch];
1129
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001130 if (s + n > e) {
1131 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001132 startinpos = s-starts;
1133 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001134 goto utf8Error;
1135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136
1137 switch (n) {
1138
1139 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001140 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001141 startinpos = s-starts;
1142 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001143 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144
1145 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001146 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001147 startinpos = s-starts;
1148 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001149 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150
1151 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001152 if ((s[1] & 0xc0) != 0x80) {
1153 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001154 startinpos = s-starts;
1155 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001156 goto utf8Error;
1157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001159 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001160 startinpos = s-starts;
1161 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001162 errmsg = "illegal encoding";
1163 goto utf8Error;
1164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001166 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 break;
1168
1169 case 3:
1170 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 (s[2] & 0xc0) != 0x80) {
1172 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001173 startinpos = s-starts;
1174 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001175 goto utf8Error;
1176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001178 if (ch < 0x0800) {
1179 /* Note: UTF-8 encodings of surrogates are considered
1180 legal UTF-8 sequences;
1181
1182 XXX For wide builds (UCS-4) we should probably try
1183 to recombine the surrogates into a single code
1184 unit.
1185 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001186 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001187 startinpos = s-starts;
1188 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001189 goto utf8Error;
1190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001192 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001193 break;
1194
1195 case 4:
1196 if ((s[1] & 0xc0) != 0x80 ||
1197 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001198 (s[3] & 0xc0) != 0x80) {
1199 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001200 startinpos = s-starts;
1201 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001202 goto utf8Error;
1203 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001204 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1205 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1206 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001207 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001208 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001209 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001210 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001211 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001212 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001213 startinpos = s-starts;
1214 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001215 goto utf8Error;
1216 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001217#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001218 *p++ = (Py_UNICODE)ch;
1219#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001220 /* compute and append the two surrogates: */
1221
1222 /* translate from 10000..10FFFF to 0..FFFF */
1223 ch -= 0x10000;
1224
1225 /* high surrogate = top 10 bits added to D800 */
1226 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1227
1228 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001229 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001230#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 break;
1232
1233 default:
1234 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001236 startinpos = s-starts;
1237 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001238 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 }
1240 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001241 continue;
1242
1243 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001244 outpos = p-PyUnicode_AS_UNICODE(unicode);
1245 if (unicode_decode_call_errorhandler(
1246 errors, &errorHandler,
1247 "utf8", errmsg,
1248 starts, size, &startinpos, &endinpos, &exc, &s,
1249 (PyObject **)&unicode, &outpos, &p))
1250 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 }
1252
1253 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001254 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 goto onError;
1256
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001257 Py_XDECREF(errorHandler);
1258 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 return (PyObject *)unicode;
1260
1261onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001262 Py_XDECREF(errorHandler);
1263 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 Py_DECREF(unicode);
1265 return NULL;
1266}
1267
Tim Peters602f7402002-04-27 18:03:26 +00001268/* Allocation strategy: if the string is short, convert into a stack buffer
1269 and allocate exactly as much space needed at the end. Else allocate the
1270 maximum possible needed (4 result bytes per Unicode character), and return
1271 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001272*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001273PyObject *
1274PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1275 int size,
1276 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277{
Tim Peters602f7402002-04-27 18:03:26 +00001278#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001279
Tim Peters602f7402002-04-27 18:03:26 +00001280 int i; /* index into s of next input byte */
1281 PyObject *v; /* result string object */
1282 char *p; /* next free byte in output buffer */
1283 int nallocated; /* number of result bytes allocated */
1284 int nneeded; /* number of result bytes needed */
1285 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001286
Tim Peters602f7402002-04-27 18:03:26 +00001287 assert(s != NULL);
1288 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289
Tim Peters602f7402002-04-27 18:03:26 +00001290 if (size <= MAX_SHORT_UNICHARS) {
1291 /* Write into the stack buffer; nallocated can't overflow.
1292 * At the end, we'll allocate exactly as much heap space as it
1293 * turns out we need.
1294 */
1295 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1296 v = NULL; /* will allocate after we're done */
1297 p = stackbuf;
1298 }
1299 else {
1300 /* Overallocate on the heap, and give the excess back at the end. */
1301 nallocated = size * 4;
1302 if (nallocated / 4 != size) /* overflow! */
1303 return PyErr_NoMemory();
1304 v = PyString_FromStringAndSize(NULL, nallocated);
1305 if (v == NULL)
1306 return NULL;
1307 p = PyString_AS_STRING(v);
1308 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001309
Tim Peters602f7402002-04-27 18:03:26 +00001310 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001311 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001312
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001313 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001314 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001315 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001316
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001318 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001319 *p++ = (char)(0xc0 | (ch >> 6));
1320 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001321 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001322 else {
Tim Peters602f7402002-04-27 18:03:26 +00001323 /* Encode UCS2 Unicode ordinals */
1324 if (ch < 0x10000) {
1325 /* Special case: check for high surrogate */
1326 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1327 Py_UCS4 ch2 = s[i];
1328 /* Check for low surrogate and combine the two to
1329 form a UCS4 value */
1330 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001331 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001332 i++;
1333 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001334 }
Tim Peters602f7402002-04-27 18:03:26 +00001335 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001336 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001337 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001338 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1339 *p++ = (char)(0x80 | (ch & 0x3f));
1340 continue;
1341 }
1342encodeUCS4:
1343 /* Encode UCS4 Unicode ordinals */
1344 *p++ = (char)(0xf0 | (ch >> 18));
1345 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1346 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1347 *p++ = (char)(0x80 | (ch & 0x3f));
1348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001350
Tim Peters602f7402002-04-27 18:03:26 +00001351 if (v == NULL) {
1352 /* This was stack allocated. */
1353 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1354 assert(nneeded <= nallocated);
1355 v = PyString_FromStringAndSize(stackbuf, nneeded);
1356 }
1357 else {
1358 /* Cut back to size actually needed. */
1359 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1360 assert(nneeded <= nallocated);
1361 _PyString_Resize(&v, nneeded);
1362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001364
Tim Peters602f7402002-04-27 18:03:26 +00001365#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366}
1367
Guido van Rossumd57fd912000-03-10 22:53:23 +00001368PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1369{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 if (!PyUnicode_Check(unicode)) {
1371 PyErr_BadArgument();
1372 return NULL;
1373 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001374 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1375 PyUnicode_GET_SIZE(unicode),
1376 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377}
1378
1379/* --- UTF-16 Codec ------------------------------------------------------- */
1380
Tim Peters772747b2001-08-09 22:21:55 +00001381PyObject *
1382PyUnicode_DecodeUTF16(const char *s,
1383 int size,
1384 const char *errors,
1385 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001387 const char *starts = s;
1388 int startinpos;
1389 int endinpos;
1390 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001391 PyUnicodeObject *unicode;
1392 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001393 const unsigned char *q, *e;
1394 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001395 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001396 /* Offsets from q for retrieving byte pairs in the right order. */
1397#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1398 int ihi = 1, ilo = 0;
1399#else
1400 int ihi = 0, ilo = 1;
1401#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001402 PyObject *errorHandler = NULL;
1403 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404
1405 /* Note: size will always be longer than the resulting Unicode
1406 character count */
1407 unicode = _PyUnicode_New(size);
1408 if (!unicode)
1409 return NULL;
1410 if (size == 0)
1411 return (PyObject *)unicode;
1412
1413 /* Unpack UTF-16 encoded data */
1414 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001415 q = (unsigned char *)s;
1416 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
1418 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001419 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001421 /* Check for BOM marks (U+FEFF) in the input and adjust current
1422 byte order setting accordingly. In native mode, the leading BOM
1423 mark is skipped, in all other modes, it is copied to the output
1424 stream as-is (giving a ZWNBSP character). */
1425 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001426 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001427#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001428 if (bom == 0xFEFF) {
1429 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001430 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001431 }
1432 else if (bom == 0xFFFE) {
1433 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001434 bo = 1;
1435 }
1436#else
Tim Peters772747b2001-08-09 22:21:55 +00001437 if (bom == 0xFEFF) {
1438 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001439 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001440 }
1441 else if (bom == 0xFFFE) {
1442 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001443 bo = -1;
1444 }
1445#endif
1446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447
Tim Peters772747b2001-08-09 22:21:55 +00001448 if (bo == -1) {
1449 /* force LE */
1450 ihi = 1;
1451 ilo = 0;
1452 }
1453 else if (bo == 1) {
1454 /* force BE */
1455 ihi = 0;
1456 ilo = 1;
1457 }
1458
1459 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001460 Py_UNICODE ch;
1461 /* remaing bytes at the end? (size should be even) */
1462 if (e-q<2) {
1463 errmsg = "truncated data";
1464 startinpos = ((const char *)q)-starts;
1465 endinpos = ((const char *)e)-starts;
1466 goto utf16Error;
1467 /* The remaining input chars are ignored if the callback
1468 chooses to skip the input */
1469 }
1470 ch = (q[ihi] << 8) | q[ilo];
1471
Tim Peters772747b2001-08-09 22:21:55 +00001472 q += 2;
1473
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 if (ch < 0xD800 || ch > 0xDFFF) {
1475 *p++ = ch;
1476 continue;
1477 }
1478
1479 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001480 if (q >= e) {
1481 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001482 startinpos = (((const char *)q)-2)-starts;
1483 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001484 goto utf16Error;
1485 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001486 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001487 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1488 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001489 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001490#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001491 *p++ = ch;
1492 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001493#else
1494 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001495#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001496 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001497 }
1498 else {
1499 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500 startinpos = (((const char *)q)-4)-starts;
1501 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001502 goto utf16Error;
1503 }
1504
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001506 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001507 startinpos = (((const char *)q)-2)-starts;
1508 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001509 /* Fall through to report the error */
1510
1511 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001512 outpos = p-PyUnicode_AS_UNICODE(unicode);
1513 if (unicode_decode_call_errorhandler(
1514 errors, &errorHandler,
1515 "utf16", errmsg,
1516 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1517 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001518 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 }
1520
1521 if (byteorder)
1522 *byteorder = bo;
1523
1524 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001525 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 goto onError;
1527
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001528 Py_XDECREF(errorHandler);
1529 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530 return (PyObject *)unicode;
1531
1532onError:
1533 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001534 Py_XDECREF(errorHandler);
1535 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001536 return NULL;
1537}
1538
Tim Peters772747b2001-08-09 22:21:55 +00001539PyObject *
1540PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1541 int size,
1542 const char *errors,
1543 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544{
1545 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001546 unsigned char *p;
1547 int i, pairs;
1548 /* Offsets from p for storing byte pairs in the right order. */
1549#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1550 int ihi = 1, ilo = 0;
1551#else
1552 int ihi = 0, ilo = 1;
1553#endif
1554
1555#define STORECHAR(CH) \
1556 do { \
1557 p[ihi] = ((CH) >> 8) & 0xff; \
1558 p[ilo] = (CH) & 0xff; \
1559 p += 2; \
1560 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001562 for (i = pairs = 0; i < size; i++)
1563 if (s[i] >= 0x10000)
1564 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001566 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001567 if (v == NULL)
1568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569
Tim Peters772747b2001-08-09 22:21:55 +00001570 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001572 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001573 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001574 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001575
1576 if (byteorder == -1) {
1577 /* force LE */
1578 ihi = 1;
1579 ilo = 0;
1580 }
1581 else if (byteorder == 1) {
1582 /* force BE */
1583 ihi = 0;
1584 ilo = 1;
1585 }
1586
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001587 while (size-- > 0) {
1588 Py_UNICODE ch = *s++;
1589 Py_UNICODE ch2 = 0;
1590 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001591 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1592 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 }
Tim Peters772747b2001-08-09 22:21:55 +00001594 STORECHAR(ch);
1595 if (ch2)
1596 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001598 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001599#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600}
1601
1602PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1603{
1604 if (!PyUnicode_Check(unicode)) {
1605 PyErr_BadArgument();
1606 return NULL;
1607 }
1608 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1609 PyUnicode_GET_SIZE(unicode),
1610 NULL,
1611 0);
1612}
1613
1614/* --- Unicode Escape Codec ----------------------------------------------- */
1615
/* File-scope pointer to a _PyUnicode_Name_CAPI structure; it starts out
   as NULL.
   NOTE(review): presumably filled in on demand by the Unicode escape
   decoder below -- confirm against the code that assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001617
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1619 int size,
1620 const char *errors)
1621{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001622 const char *starts = s;
1623 int startinpos;
1624 int endinpos;
1625 int outpos;
1626 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001627 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001630 char* message;
1631 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001632 PyObject *errorHandler = NULL;
1633 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001634
Guido van Rossumd57fd912000-03-10 22:53:23 +00001635 /* Escaped strings will always be longer than the resulting
1636 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001637 length after conversion to the true value.
1638 (but if the error callback returns a long replacement string
1639 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001640 v = _PyUnicode_New(size);
1641 if (v == NULL)
1642 goto onError;
1643 if (size == 0)
1644 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001646 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001648
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649 while (s < end) {
1650 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001651 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653
1654 /* Non-escape characters are interpreted as Unicode ordinals */
1655 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001656 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657 continue;
1658 }
1659
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001660 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 /* \ - Escapes */
1662 s++;
1663 switch (*s++) {
1664
1665 /* \x escapes */
1666 case '\n': break;
1667 case '\\': *p++ = '\\'; break;
1668 case '\'': *p++ = '\''; break;
1669 case '\"': *p++ = '\"'; break;
1670 case 'b': *p++ = '\b'; break;
1671 case 'f': *p++ = '\014'; break; /* FF */
1672 case 't': *p++ = '\t'; break;
1673 case 'n': *p++ = '\n'; break;
1674 case 'r': *p++ = '\r'; break;
1675 case 'v': *p++ = '\013'; break; /* VT */
1676 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1677
1678 /* \OOO (octal) escapes */
1679 case '0': case '1': case '2': case '3':
1680 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001681 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001683 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001685 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001687 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 break;
1689
Fredrik Lundhccc74732001-02-18 22:13:49 +00001690 /* hex escapes */
1691 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001693 digits = 2;
1694 message = "truncated \\xXX escape";
1695 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696
Fredrik Lundhccc74732001-02-18 22:13:49 +00001697 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001699 digits = 4;
1700 message = "truncated \\uXXXX escape";
1701 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702
Fredrik Lundhccc74732001-02-18 22:13:49 +00001703 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001704 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001705 digits = 8;
1706 message = "truncated \\UXXXXXXXX escape";
1707 hexescape:
1708 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001709 outpos = p-PyUnicode_AS_UNICODE(v);
1710 if (s+digits>end) {
1711 endinpos = size;
1712 if (unicode_decode_call_errorhandler(
1713 errors, &errorHandler,
1714 "unicodeescape", "end of string in escape sequence",
1715 starts, size, &startinpos, &endinpos, &exc, &s,
1716 (PyObject **)&v, &outpos, &p))
1717 goto onError;
1718 goto nextByte;
1719 }
1720 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001721 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001722 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001723 endinpos = (s+i+1)-starts;
1724 if (unicode_decode_call_errorhandler(
1725 errors, &errorHandler,
1726 "unicodeescape", message,
1727 starts, size, &startinpos, &endinpos, &exc, &s,
1728 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001729 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001730 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001731 }
1732 chr = (chr<<4) & ~0xF;
1733 if (c >= '0' && c <= '9')
1734 chr += c - '0';
1735 else if (c >= 'a' && c <= 'f')
1736 chr += 10 + c - 'a';
1737 else
1738 chr += 10 + c - 'A';
1739 }
1740 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001741 if (chr == 0xffffffff)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 /* _decoding_error will have already written into the
1743 target buffer. */
1744 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001745 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001746 /* when we get here, chr is a 32-bit unicode character */
1747 if (chr <= 0xffff)
1748 /* UCS-2 character */
1749 *p++ = (Py_UNICODE) chr;
1750 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001751 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001752 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001753#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001754 *p++ = chr;
1755#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001756 chr -= 0x10000L;
1757 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001758 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001759#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001760 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 endinpos = s-starts;
1762 outpos = p-PyUnicode_AS_UNICODE(v);
1763 if (unicode_decode_call_errorhandler(
1764 errors, &errorHandler,
1765 "unicodeescape", "illegal Unicode character",
1766 starts, size, &startinpos, &endinpos, &exc, &s,
1767 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001768 goto onError;
1769 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001770 break;
1771
1772 /* \N{name} */
1773 case 'N':
1774 message = "malformed \\N character escape";
1775 if (ucnhash_CAPI == NULL) {
1776 /* load the unicode data module */
1777 PyObject *m, *v;
1778 m = PyImport_ImportModule("unicodedata");
1779 if (m == NULL)
1780 goto ucnhashError;
1781 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1782 Py_DECREF(m);
1783 if (v == NULL)
1784 goto ucnhashError;
1785 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1786 Py_DECREF(v);
1787 if (ucnhash_CAPI == NULL)
1788 goto ucnhashError;
1789 }
1790 if (*s == '{') {
1791 const char *start = s+1;
1792 /* look for the closing brace */
1793 while (*s != '}' && s < end)
1794 s++;
1795 if (s > start && s < end && *s == '}') {
1796 /* found a name. look it up in the unicode database */
1797 message = "unknown Unicode character name";
1798 s++;
1799 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1800 goto store;
1801 }
1802 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 endinpos = s-starts;
1804 outpos = p-PyUnicode_AS_UNICODE(v);
1805 if (unicode_decode_call_errorhandler(
1806 errors, &errorHandler,
1807 "unicodeescape", message,
1808 starts, size, &startinpos, &endinpos, &exc, &s,
1809 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001810 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001811 break;
1812
1813 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001814 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 message = "\\ at end of string";
1816 s--;
1817 endinpos = s-starts;
1818 outpos = p-PyUnicode_AS_UNICODE(v);
1819 if (unicode_decode_call_errorhandler(
1820 errors, &errorHandler,
1821 "unicodeescape", message,
1822 starts, size, &startinpos, &endinpos, &exc, &s,
1823 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001824 goto onError;
1825 }
1826 else {
1827 *p++ = '\\';
1828 *p++ = (unsigned char)s[-1];
1829 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001830 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 nextByte:
1833 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1836 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001838
Fredrik Lundhccc74732001-02-18 22:13:49 +00001839ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001840 PyErr_SetString(
1841 PyExc_UnicodeError,
1842 "\\N escapes not supported (can't load unicodedata module)"
1843 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 Py_XDECREF(errorHandler);
1845 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001846 return NULL;
1847
Fredrik Lundhccc74732001-02-18 22:13:49 +00001848onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001850 Py_XDECREF(errorHandler);
1851 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852 return NULL;
1853}
1854
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
1861
Barry Warsaw51ac5802000-03-20 16:36:48 +00001862static const Py_UNICODE *findchar(const Py_UNICODE *s,
1863 int size,
1864 Py_UNICODE ch);
1865
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866static
1867PyObject *unicodeescape_string(const Py_UNICODE *s,
1868 int size,
1869 int quotes)
1870{
1871 PyObject *repr;
1872 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001874 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875
1876 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1877 if (repr == NULL)
1878 return NULL;
1879
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001880 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001881
1882 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883 *p++ = 'u';
1884 *p++ = (findchar(s, size, '\'') &&
1885 !findchar(s, size, '"')) ? '"' : '\'';
1886 }
1887 while (size-- > 0) {
1888 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001889
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001891 if (quotes &&
1892 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001893 *p++ = '\\';
1894 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001895 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001896 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001897
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001898#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001899 /* Map 21-bit characters to '\U00xxxxxx' */
1900 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001901 int offset = p - PyString_AS_STRING(repr);
1902
1903 /* Resize the string if necessary */
1904 if (offset + 12 > PyString_GET_SIZE(repr)) {
1905 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001906 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001907 p = PyString_AS_STRING(repr) + offset;
1908 }
1909
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001910 *p++ = '\\';
1911 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001912 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1913 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1914 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1915 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1916 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1917 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1918 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001919 *p++ = hexdigit[ch & 0x0000000F];
1920 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001921 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001922#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001923 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1924 else if (ch >= 0xD800 && ch < 0xDC00) {
1925 Py_UNICODE ch2;
1926 Py_UCS4 ucs;
1927
1928 ch2 = *s++;
1929 size--;
1930 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1931 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1932 *p++ = '\\';
1933 *p++ = 'U';
1934 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1935 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1936 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1937 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1938 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1939 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1940 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1941 *p++ = hexdigit[ucs & 0x0000000F];
1942 continue;
1943 }
1944 /* Fall through: isolated surrogates are copied as-is */
1945 s--;
1946 size++;
1947 }
1948
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001950 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 *p++ = '\\';
1952 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001953 *p++ = hexdigit[(ch >> 12) & 0x000F];
1954 *p++ = hexdigit[(ch >> 8) & 0x000F];
1955 *p++ = hexdigit[(ch >> 4) & 0x000F];
1956 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001958
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001959 /* Map special whitespace to '\t', \n', '\r' */
1960 else if (ch == '\t') {
1961 *p++ = '\\';
1962 *p++ = 't';
1963 }
1964 else if (ch == '\n') {
1965 *p++ = '\\';
1966 *p++ = 'n';
1967 }
1968 else if (ch == '\r') {
1969 *p++ = '\\';
1970 *p++ = 'r';
1971 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001972
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001973 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001974 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001976 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001977 *p++ = hexdigit[(ch >> 4) & 0x000F];
1978 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001980
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 /* Copy everything else as-is */
1982 else
1983 *p++ = (char) ch;
1984 }
1985 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001986 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987
1988 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001989 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 return repr;
1991}
1992
1993PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1994 int size)
1995{
1996 return unicodeescape_string(s, size, 0);
1997}
1998
1999PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2000{
2001 if (!PyUnicode_Check(unicode)) {
2002 PyErr_BadArgument();
2003 return NULL;
2004 }
2005 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2006 PyUnicode_GET_SIZE(unicode));
2007}
2008
/* --- Raw Unicode Escape Codec ------------------------------------------- */
2010
2011PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2012 int size,
2013 const char *errors)
2014{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002015 const char *starts = s;
2016 int startinpos;
2017 int endinpos;
2018 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 const char *end;
2022 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002023 PyObject *errorHandler = NULL;
2024 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002025
2026 /* Escaped strings will always be longer than the resulting
2027 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028 length after conversion to the true value. (But decoding error
2029 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030 v = _PyUnicode_New(size);
2031 if (v == NULL)
2032 goto onError;
2033 if (size == 0)
2034 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002035 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 end = s + size;
2037 while (s < end) {
2038 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002039 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002041 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
2043 /* Non-escape characters are interpreted as Unicode ordinals */
2044 if (*s != '\\') {
2045 *p++ = (unsigned char)*s++;
2046 continue;
2047 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049
2050 /* \u-escapes are only interpreted iff the number of leading
2051 backslashes if odd */
2052 bs = s;
2053 for (;s < end;) {
2054 if (*s != '\\')
2055 break;
2056 *p++ = (unsigned char)*s++;
2057 }
2058 if (((s - bs) & 1) == 0 ||
2059 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002060 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 continue;
2062 }
2063 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002064 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 s++;
2066
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002067 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002068 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002069 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 endinpos = s-starts;
2073 if (unicode_decode_call_errorhandler(
2074 errors, &errorHandler,
2075 "rawunicodeescape", "truncated \\uXXXX",
2076 starts, size, &startinpos, &endinpos, &exc, &s,
2077 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 }
2081 x = (x<<4) & ~0xF;
2082 if (c >= '0' && c <= '9')
2083 x += c - '0';
2084 else if (c >= 'a' && c <= 'f')
2085 x += 10 + c - 'a';
2086 else
2087 x += 10 + c - 'A';
2088 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002089#ifndef Py_UNICODE_WIDE
2090 if (x > 0x10000) {
2091 if (unicode_decode_call_errorhandler(
2092 errors, &errorHandler,
2093 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2094 starts, size, &startinpos, &endinpos, &exc, &s,
2095 (PyObject **)&v, &outpos, &p))
2096 goto onError;
2097 }
2098#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002099 *p++ = x;
2100 nextByte:
2101 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002103 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002104 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002105 Py_XDECREF(errorHandler);
2106 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 return (PyObject *)v;
2108
2109 onError:
2110 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002111 Py_XDECREF(errorHandler);
2112 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 return NULL;
2114}
2115
2116PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2117 int size)
2118{
2119 PyObject *repr;
2120 char *p;
2121 char *q;
2122
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002123 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002125#ifdef Py_UNICODE_WIDE
2126 repr = PyString_FromStringAndSize(NULL, 10 * size);
2127#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002129#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 if (repr == NULL)
2131 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002132 if (size == 0)
2133 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134
2135 p = q = PyString_AS_STRING(repr);
2136 while (size-- > 0) {
2137 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002138#ifdef Py_UNICODE_WIDE
2139 /* Map 32-bit characters to '\Uxxxxxxxx' */
2140 if (ch >= 0x10000) {
2141 *p++ = '\\';
2142 *p++ = 'U';
2143 *p++ = hexdigit[(ch >> 28) & 0xf];
2144 *p++ = hexdigit[(ch >> 24) & 0xf];
2145 *p++ = hexdigit[(ch >> 20) & 0xf];
2146 *p++ = hexdigit[(ch >> 16) & 0xf];
2147 *p++ = hexdigit[(ch >> 12) & 0xf];
2148 *p++ = hexdigit[(ch >> 8) & 0xf];
2149 *p++ = hexdigit[(ch >> 4) & 0xf];
2150 *p++ = hexdigit[ch & 15];
2151 }
2152 else
2153#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154 /* Map 16-bit characters to '\uxxxx' */
2155 if (ch >= 256) {
2156 *p++ = '\\';
2157 *p++ = 'u';
2158 *p++ = hexdigit[(ch >> 12) & 0xf];
2159 *p++ = hexdigit[(ch >> 8) & 0xf];
2160 *p++ = hexdigit[(ch >> 4) & 0xf];
2161 *p++ = hexdigit[ch & 15];
2162 }
2163 /* Copy everything else as-is */
2164 else
2165 *p++ = (char) ch;
2166 }
2167 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002168 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 return repr;
2170}
2171
2172PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2173{
2174 if (!PyUnicode_Check(unicode)) {
2175 PyErr_BadArgument();
2176 return NULL;
2177 }
2178 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2179 PyUnicode_GET_SIZE(unicode));
2180}
2181
/* --- Latin-1 Codec ------------------------------------------------------ */
2183
2184PyObject *PyUnicode_DecodeLatin1(const char *s,
2185 int size,
2186 const char *errors)
2187{
2188 PyUnicodeObject *v;
2189 Py_UNICODE *p;
2190
2191 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002192 if (size == 1 && *(unsigned char*)s < 256) {
2193 Py_UNICODE r = *(unsigned char*)s;
2194 return PyUnicode_FromUnicode(&r, 1);
2195 }
2196
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 v = _PyUnicode_New(size);
2198 if (v == NULL)
2199 goto onError;
2200 if (size == 0)
2201 return (PyObject *)v;
2202 p = PyUnicode_AS_UNICODE(v);
2203 while (size-- > 0)
2204 *p++ = (unsigned char)*s++;
2205 return (PyObject *)v;
2206
2207 onError:
2208 Py_XDECREF(v);
2209 return NULL;
2210}
2211
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002212/* create or adjust a UnicodeEncodeError */
2213static void make_encode_exception(PyObject **exceptionObject,
2214 const char *encoding,
2215 const Py_UNICODE *unicode, int size,
2216 int startpos, int endpos,
2217 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002218{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002219 if (*exceptionObject == NULL) {
2220 *exceptionObject = PyUnicodeEncodeError_Create(
2221 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 }
2223 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002224 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2225 goto onError;
2226 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2227 goto onError;
2228 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2229 goto onError;
2230 return;
2231 onError:
2232 Py_DECREF(*exceptionObject);
2233 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 }
2235}
2236
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002237/* raises a UnicodeEncodeError */
2238static void raise_encode_exception(PyObject **exceptionObject,
2239 const char *encoding,
2240 const Py_UNICODE *unicode, int size,
2241 int startpos, int endpos,
2242 const char *reason)
2243{
2244 make_encode_exception(exceptionObject,
2245 encoding, unicode, size, startpos, endpos, reason);
2246 if (*exceptionObject != NULL)
2247 PyCodec_StrictErrors(*exceptionObject);
2248}
2249
2250/* error handling callback helper:
2251 build arguments, call the callback and check the arguments,
2252 put the result into newpos and return the replacement string, which
2253 has to be freed by the caller */
2254static PyObject *unicode_encode_call_errorhandler(const char *errors,
2255 PyObject **errorHandler,
2256 const char *encoding, const char *reason,
2257 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2258 int startpos, int endpos,
2259 int *newpos)
2260{
2261 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2262
2263 PyObject *restuple;
2264 PyObject *resunicode;
2265
2266 if (*errorHandler == NULL) {
2267 *errorHandler = PyCodec_LookupError(errors);
2268 if (*errorHandler == NULL)
2269 return NULL;
2270 }
2271
2272 make_encode_exception(exceptionObject,
2273 encoding, unicode, size, startpos, endpos, reason);
2274 if (*exceptionObject == NULL)
2275 return NULL;
2276
2277 restuple = PyObject_CallFunctionObjArgs(
2278 *errorHandler, *exceptionObject, NULL);
2279 if (restuple == NULL)
2280 return NULL;
2281 if (!PyTuple_Check(restuple)) {
2282 PyErr_Format(PyExc_TypeError, &argparse[4]);
2283 Py_DECREF(restuple);
2284 return NULL;
2285 }
2286 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2287 &resunicode, newpos)) {
2288 Py_DECREF(restuple);
2289 return NULL;
2290 }
2291 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002292 *newpos = size+*newpos;
2293 if (*newpos<0 || *newpos>size) {
2294 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2295 Py_DECREF(restuple);
2296 return NULL;
2297 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002298 Py_INCREF(resunicode);
2299 Py_DECREF(restuple);
2300 return resunicode;
2301}
2302
2303static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2304 int size,
2305 const char *errors,
2306 int limit)
2307{
2308 /* output object */
2309 PyObject *res;
2310 /* pointers to the beginning and end+1 of input */
2311 const Py_UNICODE *startp = p;
2312 const Py_UNICODE *endp = p + size;
2313 /* pointer to the beginning of the unencodable characters */
2314 /* const Py_UNICODE *badp = NULL; */
2315 /* pointer into the output */
2316 char *str;
2317 /* current output position */
2318 int respos = 0;
2319 int ressize;
2320 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2321 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2322 PyObject *errorHandler = NULL;
2323 PyObject *exc = NULL;
2324 /* the following variable is used for caching string comparisons
2325 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2326 int known_errorHandler = -1;
2327
2328 /* allocate enough for a simple encoding without
2329 replacements, if we need more, we'll resize */
2330 res = PyString_FromStringAndSize(NULL, size);
2331 if (res == NULL)
2332 goto onError;
2333 if (size == 0)
2334 return res;
2335 str = PyString_AS_STRING(res);
2336 ressize = size;
2337
2338 while (p<endp) {
2339 Py_UNICODE c = *p;
2340
2341 /* can we encode this? */
2342 if (c<limit) {
2343 /* no overflow check, because we know that the space is enough */
2344 *str++ = (char)c;
2345 ++p;
2346 }
2347 else {
2348 int unicodepos = p-startp;
2349 int requiredsize;
2350 PyObject *repunicode;
2351 int repsize;
2352 int newpos;
2353 int respos;
2354 Py_UNICODE *uni2;
2355 /* startpos for collecting unencodable chars */
2356 const Py_UNICODE *collstart = p;
2357 const Py_UNICODE *collend = p;
2358 /* find all unecodable characters */
2359 while ((collend < endp) && ((*collend)>=limit))
2360 ++collend;
2361 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2362 if (known_errorHandler==-1) {
2363 if ((errors==NULL) || (!strcmp(errors, "strict")))
2364 known_errorHandler = 1;
2365 else if (!strcmp(errors, "replace"))
2366 known_errorHandler = 2;
2367 else if (!strcmp(errors, "ignore"))
2368 known_errorHandler = 3;
2369 else if (!strcmp(errors, "xmlcharrefreplace"))
2370 known_errorHandler = 4;
2371 else
2372 known_errorHandler = 0;
2373 }
2374 switch (known_errorHandler) {
2375 case 1: /* strict */
2376 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2377 goto onError;
2378 case 2: /* replace */
2379 while (collstart++<collend)
2380 *str++ = '?'; /* fall through */
2381 case 3: /* ignore */
2382 p = collend;
2383 break;
2384 case 4: /* xmlcharrefreplace */
2385 respos = str-PyString_AS_STRING(res);
2386 /* determine replacement size (temporarily (mis)uses p) */
2387 for (p = collstart, repsize = 0; p < collend; ++p) {
2388 if (*p<10)
2389 repsize += 2+1+1;
2390 else if (*p<100)
2391 repsize += 2+2+1;
2392 else if (*p<1000)
2393 repsize += 2+3+1;
2394 else if (*p<10000)
2395 repsize += 2+4+1;
2396 else if (*p<100000)
2397 repsize += 2+5+1;
2398 else if (*p<1000000)
2399 repsize += 2+6+1;
2400 else
2401 repsize += 2+7+1;
2402 }
2403 requiredsize = respos+repsize+(endp-collend);
2404 if (requiredsize > ressize) {
2405 if (requiredsize<2*ressize)
2406 requiredsize = 2*ressize;
2407 if (_PyString_Resize(&res, requiredsize))
2408 goto onError;
2409 str = PyString_AS_STRING(res) + respos;
2410 ressize = requiredsize;
2411 }
2412 /* generate replacement (temporarily (mis)uses p) */
2413 for (p = collstart; p < collend; ++p) {
2414 str += sprintf(str, "&#%d;", (int)*p);
2415 }
2416 p = collend;
2417 break;
2418 default:
2419 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2420 encoding, reason, startp, size, &exc,
2421 collstart-startp, collend-startp, &newpos);
2422 if (repunicode == NULL)
2423 goto onError;
2424 /* need more space? (at least enough for what we
2425 have+the replacement+the rest of the string, so
2426 we won't have to check space for encodable characters) */
2427 respos = str-PyString_AS_STRING(res);
2428 repsize = PyUnicode_GET_SIZE(repunicode);
2429 requiredsize = respos+repsize+(endp-collend);
2430 if (requiredsize > ressize) {
2431 if (requiredsize<2*ressize)
2432 requiredsize = 2*ressize;
2433 if (_PyString_Resize(&res, requiredsize)) {
2434 Py_DECREF(repunicode);
2435 goto onError;
2436 }
2437 str = PyString_AS_STRING(res) + respos;
2438 ressize = requiredsize;
2439 }
2440 /* check if there is anything unencodable in the replacement
2441 and copy it to the output */
2442 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2443 c = *uni2;
2444 if (c >= limit) {
2445 raise_encode_exception(&exc, encoding, startp, size,
2446 unicodepos, unicodepos+1, reason);
2447 Py_DECREF(repunicode);
2448 goto onError;
2449 }
2450 *str = (char)c;
2451 }
2452 p = startp + newpos;
2453 Py_DECREF(repunicode);
2454 }
2455 }
2456 }
2457 /* Resize if we allocated to much */
2458 respos = str-PyString_AS_STRING(res);
2459 if (respos<ressize)
2460 /* If this falls res will be NULL */
2461 _PyString_Resize(&res, respos);
2462 Py_XDECREF(errorHandler);
2463 Py_XDECREF(exc);
2464 return res;
2465
2466 onError:
2467 Py_XDECREF(res);
2468 Py_XDECREF(errorHandler);
2469 Py_XDECREF(exc);
2470 return NULL;
2471}
2472
Guido van Rossumd57fd912000-03-10 22:53:23 +00002473PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2474 int size,
2475 const char *errors)
2476{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002477 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478}
2479
2480PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2481{
2482 if (!PyUnicode_Check(unicode)) {
2483 PyErr_BadArgument();
2484 return NULL;
2485 }
2486 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2487 PyUnicode_GET_SIZE(unicode),
2488 NULL);
2489}
2490
/* --- 7-bit ASCII Codec -------------------------------------------------- */
2492
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493PyObject *PyUnicode_DecodeASCII(const char *s,
2494 int size,
2495 const char *errors)
2496{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002497 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 PyUnicodeObject *v;
2499 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500 int startinpos;
2501 int endinpos;
2502 int outpos;
2503 const char *e;
2504 PyObject *errorHandler = NULL;
2505 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506
2507 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002508 if (size == 1 && *(unsigned char*)s < 128) {
2509 Py_UNICODE r = *(unsigned char*)s;
2510 return PyUnicode_FromUnicode(&r, 1);
2511 }
2512
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 v = _PyUnicode_New(size);
2514 if (v == NULL)
2515 goto onError;
2516 if (size == 0)
2517 return (PyObject *)v;
2518 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002519 e = s + size;
2520 while (s < e) {
2521 register unsigned char c = (unsigned char)*s;
2522 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002524 ++s;
2525 }
2526 else {
2527 startinpos = s-starts;
2528 endinpos = startinpos + 1;
2529 outpos = p-PyUnicode_AS_UNICODE(v);
2530 if (unicode_decode_call_errorhandler(
2531 errors, &errorHandler,
2532 "ascii", "ordinal not in range(128)",
2533 starts, size, &startinpos, &endinpos, &exc, &s,
2534 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002538 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002539 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002540 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002541 Py_XDECREF(errorHandler);
2542 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 return (PyObject *)v;
2544
2545 onError:
2546 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002547 Py_XDECREF(errorHandler);
2548 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 return NULL;
2550}
2551
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2553 int size,
2554 const char *errors)
2555{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002556 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557}
2558
2559PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2560{
2561 if (!PyUnicode_Check(unicode)) {
2562 PyErr_BadArgument();
2563 return NULL;
2564 }
2565 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2566 PyUnicode_GET_SIZE(unicode),
2567 NULL);
2568}
2569
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002570#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002571
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002572/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002573
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002574PyObject *PyUnicode_DecodeMBCS(const char *s,
2575 int size,
2576 const char *errors)
2577{
2578 PyUnicodeObject *v;
2579 Py_UNICODE *p;
2580
2581 /* First get the size of the result */
2582 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002583 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002584 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2585
2586 v = _PyUnicode_New(usize);
2587 if (v == NULL)
2588 return NULL;
2589 if (usize == 0)
2590 return (PyObject *)v;
2591 p = PyUnicode_AS_UNICODE(v);
2592 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2593 Py_DECREF(v);
2594 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2595 }
2596
2597 return (PyObject *)v;
2598}
2599
2600PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2601 int size,
2602 const char *errors)
2603{
2604 PyObject *repr;
2605 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002606 DWORD mbcssize;
2607
2608 /* If there are no characters, bail now! */
2609 if (size==0)
2610 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002611
2612 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002613 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002614 if (mbcssize==0)
2615 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2616
2617 repr = PyString_FromStringAndSize(NULL, mbcssize);
2618 if (repr == NULL)
2619 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002620 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002621 return repr;
2622
2623 /* Do the conversion */
2624 s = PyString_AS_STRING(repr);
2625 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2626 Py_DECREF(repr);
2627 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2628 }
2629 return repr;
2630}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002631
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002632PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2633{
2634 if (!PyUnicode_Check(unicode)) {
2635 PyErr_BadArgument();
2636 return NULL;
2637 }
2638 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2639 PyUnicode_GET_SIZE(unicode),
2640 NULL);
2641}
2642
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002643#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002644
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645/* --- Character Mapping Codec -------------------------------------------- */
2646
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647PyObject *PyUnicode_DecodeCharmap(const char *s,
2648 int size,
2649 PyObject *mapping,
2650 const char *errors)
2651{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002652 const char *starts = s;
2653 int startinpos;
2654 int endinpos;
2655 int outpos;
2656 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657 PyUnicodeObject *v;
2658 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002659 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002660 PyObject *errorHandler = NULL;
2661 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662
2663 /* Default to Latin-1 */
2664 if (mapping == NULL)
2665 return PyUnicode_DecodeLatin1(s, size, errors);
2666
2667 v = _PyUnicode_New(size);
2668 if (v == NULL)
2669 goto onError;
2670 if (size == 0)
2671 return (PyObject *)v;
2672 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002673 e = s + size;
2674 while (s < e) {
2675 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 PyObject *w, *x;
2677
2678 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2679 w = PyInt_FromLong((long)ch);
2680 if (w == NULL)
2681 goto onError;
2682 x = PyObject_GetItem(mapping, w);
2683 Py_DECREF(w);
2684 if (x == NULL) {
2685 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002686 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002688 x = Py_None;
2689 Py_INCREF(x);
2690 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002691 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 }
2693
2694 /* Apply mapping */
2695 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002696 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 if (value < 0 || value > 65535) {
2698 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002699 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 Py_DECREF(x);
2701 goto onError;
2702 }
2703 *p++ = (Py_UNICODE)value;
2704 }
2705 else if (x == Py_None) {
2706 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 outpos = p-PyUnicode_AS_UNICODE(v);
2708 startinpos = s-starts;
2709 endinpos = startinpos+1;
2710 if (unicode_decode_call_errorhandler(
2711 errors, &errorHandler,
2712 "charmap", "character maps to <undefined>",
2713 starts, size, &startinpos, &endinpos, &exc, &s,
2714 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715 Py_DECREF(x);
2716 goto onError;
2717 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719 }
2720 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002721 int targetsize = PyUnicode_GET_SIZE(x);
2722
2723 if (targetsize == 1)
2724 /* 1-1 mapping */
2725 *p++ = *PyUnicode_AS_UNICODE(x);
2726
2727 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002729 if (targetsize > extrachars) {
2730 /* resize first */
2731 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2732 int needed = (targetsize - extrachars) + \
2733 (targetsize << 2);
2734 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002735 if (_PyUnicode_Resize(&v,
2736 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002737 Py_DECREF(x);
2738 goto onError;
2739 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002740 p = PyUnicode_AS_UNICODE(v) + oldpos;
2741 }
2742 Py_UNICODE_COPY(p,
2743 PyUnicode_AS_UNICODE(x),
2744 targetsize);
2745 p += targetsize;
2746 extrachars -= targetsize;
2747 }
2748 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 }
2750 else {
2751 /* wrong return value */
2752 PyErr_SetString(PyExc_TypeError,
2753 "character mapping must return integer, None or unicode");
2754 Py_DECREF(x);
2755 goto onError;
2756 }
2757 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002758 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 }
2760 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002761 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002763 Py_XDECREF(errorHandler);
2764 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765 return (PyObject *)v;
2766
2767 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 Py_XDECREF(errorHandler);
2769 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 Py_XDECREF(v);
2771 return NULL;
2772}
2773
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002774/* Lookup the character ch in the mapping. If the character
2775 can't be found, Py_None is returned (or NULL, if another
2776 error occured). */
2777static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 PyObject *w = PyInt_FromLong((long)c);
2780 PyObject *x;
2781
2782 if (w == NULL)
2783 return NULL;
2784 x = PyObject_GetItem(mapping, w);
2785 Py_DECREF(w);
2786 if (x == NULL) {
2787 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2788 /* No mapping found means: mapping is undefined. */
2789 PyErr_Clear();
2790 x = Py_None;
2791 Py_INCREF(x);
2792 return x;
2793 } else
2794 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002796 else if (x == Py_None)
2797 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002798 else if (PyInt_Check(x)) {
2799 long value = PyInt_AS_LONG(x);
2800 if (value < 0 || value > 255) {
2801 PyErr_SetString(PyExc_TypeError,
2802 "character mapping must be in range(256)");
2803 Py_DECREF(x);
2804 return NULL;
2805 }
2806 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002808 else if (PyString_Check(x))
2809 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002811 /* wrong return value */
2812 PyErr_SetString(PyExc_TypeError,
2813 "character mapping must return integer, None or str");
2814 Py_DECREF(x);
2815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 }
2817}
2818
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002819/* lookup the character, put the result in the output string and adjust
2820 various state variables. Reallocate the output string if not enough
2821 space is available. Return a new reference to the object that
2822 was put in the output buffer, or Py_None, if the mapping was undefined
2823 (in which case no character was written) or NULL, if a
2824 reallocation error ocurred. The called must decref the result */
2825static
2826PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2827 PyObject **outobj, int *outpos)
2828{
2829 PyObject *rep = charmapencode_lookup(c, mapping);
2830
2831 if (rep==NULL)
2832 return NULL;
2833 else if (rep==Py_None)
2834 return rep;
2835 else {
2836 char *outstart = PyString_AS_STRING(*outobj);
2837 int outsize = PyString_GET_SIZE(*outobj);
2838 if (PyInt_Check(rep)) {
2839 int requiredsize = *outpos+1;
2840 if (outsize<requiredsize) {
2841 /* exponentially overallocate to minimize reallocations */
2842 if (requiredsize < 2*outsize)
2843 requiredsize = 2*outsize;
2844 if (_PyString_Resize(outobj, requiredsize)) {
2845 Py_DECREF(rep);
2846 return NULL;
2847 }
2848 outstart = PyString_AS_STRING(*outobj);
2849 }
2850 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2851 }
2852 else {
2853 const char *repchars = PyString_AS_STRING(rep);
2854 int repsize = PyString_GET_SIZE(rep);
2855 int requiredsize = *outpos+repsize;
2856 if (outsize<requiredsize) {
2857 /* exponentially overallocate to minimize reallocations */
2858 if (requiredsize < 2*outsize)
2859 requiredsize = 2*outsize;
2860 if (_PyString_Resize(outobj, requiredsize)) {
2861 Py_DECREF(rep);
2862 return NULL;
2863 }
2864 outstart = PyString_AS_STRING(*outobj);
2865 }
2866 memcpy(outstart + *outpos, repchars, repsize);
2867 *outpos += repsize;
2868 }
2869 }
2870 return rep;
2871}
2872
2873/* handle an error in PyUnicode_EncodeCharmap
2874 Return 0 on success, -1 on error */
2875static
2876int charmap_encoding_error(
2877 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2878 PyObject **exceptionObject,
2879 int *known_errorHandler, PyObject *errorHandler, const char *errors,
2880 PyObject **res, int *respos)
2881{
2882 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2883 int repsize;
2884 int newpos;
2885 Py_UNICODE *uni2;
2886 /* startpos for collecting unencodable chars */
2887 int collstartpos = *inpos;
2888 int collendpos = *inpos+1;
2889 int collpos;
2890 char *encoding = "charmap";
2891 char *reason = "character maps to <undefined>";
2892
2893 PyObject *x;
2894 /* find all unencodable characters */
2895 while (collendpos < size) {
2896 x = charmapencode_lookup(p[collendpos], mapping);
2897 if (x==NULL)
2898 return -1;
2899 else if (x!=Py_None) {
2900 Py_DECREF(x);
2901 break;
2902 }
2903 Py_DECREF(x);
2904 ++collendpos;
2905 }
2906 /* cache callback name lookup
2907 * (if not done yet, i.e. it's the first error) */
2908 if (*known_errorHandler==-1) {
2909 if ((errors==NULL) || (!strcmp(errors, "strict")))
2910 *known_errorHandler = 1;
2911 else if (!strcmp(errors, "replace"))
2912 *known_errorHandler = 2;
2913 else if (!strcmp(errors, "ignore"))
2914 *known_errorHandler = 3;
2915 else if (!strcmp(errors, "xmlcharrefreplace"))
2916 *known_errorHandler = 4;
2917 else
2918 *known_errorHandler = 0;
2919 }
2920 switch (*known_errorHandler) {
2921 case 1: /* strict */
2922 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2923 return -1;
2924 case 2: /* replace */
2925 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2926 x = charmapencode_output('?', mapping, res, respos);
2927 if (x==NULL) {
2928 return -1;
2929 }
2930 else if (x==Py_None) {
2931 Py_DECREF(x);
2932 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2933 return -1;
2934 }
2935 Py_DECREF(x);
2936 }
2937 /* fall through */
2938 case 3: /* ignore */
2939 *inpos = collendpos;
2940 break;
2941 case 4: /* xmlcharrefreplace */
2942 /* generate replacement (temporarily (mis)uses p) */
2943 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2944 char buffer[2+29+1+1];
2945 char *cp;
2946 sprintf(buffer, "&#%d;", (int)p[collpos]);
2947 for (cp = buffer; *cp; ++cp) {
2948 x = charmapencode_output(*cp, mapping, res, respos);
2949 if (x==NULL)
2950 return -1;
2951 else if (x==Py_None) {
2952 Py_DECREF(x);
2953 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2954 return -1;
2955 }
2956 Py_DECREF(x);
2957 }
2958 }
2959 *inpos = collendpos;
2960 break;
2961 default:
2962 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2963 encoding, reason, p, size, exceptionObject,
2964 collstartpos, collendpos, &newpos);
2965 if (repunicode == NULL)
2966 return -1;
2967 /* generate replacement */
2968 repsize = PyUnicode_GET_SIZE(repunicode);
2969 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2970 x = charmapencode_output(*uni2, mapping, res, respos);
2971 if (x==NULL) {
2972 Py_DECREF(repunicode);
2973 return -1;
2974 }
2975 else if (x==Py_None) {
2976 Py_DECREF(repunicode);
2977 Py_DECREF(x);
2978 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2979 return -1;
2980 }
2981 Py_DECREF(x);
2982 }
2983 *inpos = newpos;
2984 Py_DECREF(repunicode);
2985 }
2986 return 0;
2987}
2988
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2990 int size,
2991 PyObject *mapping,
2992 const char *errors)
2993{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002994 /* output object */
2995 PyObject *res = NULL;
2996 /* current input position */
2997 int inpos = 0;
2998 /* current output position */
2999 int respos = 0;
3000 PyObject *errorHandler = NULL;
3001 PyObject *exc = NULL;
3002 /* the following variable is used for caching string comparisons
3003 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3004 * 3=ignore, 4=xmlcharrefreplace */
3005 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006
3007 /* Default to Latin-1 */
3008 if (mapping == NULL)
3009 return PyUnicode_EncodeLatin1(p, size, errors);
3010
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003011 /* allocate enough for a simple encoding without
3012 replacements, if we need more, we'll resize */
3013 res = PyString_FromStringAndSize(NULL, size);
3014 if (res == NULL)
3015 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003016 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003017 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003019 while (inpos<size) {
3020 /* try to encode it */
3021 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3022 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003024 if (x==Py_None) { /* unencodable character */
3025 if (charmap_encoding_error(p, size, &inpos, mapping,
3026 &exc,
3027 &known_errorHandler, errorHandler, errors,
3028 &res, &respos))
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003029 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 else
3032 /* done with this character => adjust input position */
3033 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 Py_DECREF(x);
3035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003037 /* Resize if we allocated to much */
3038 if (respos<PyString_GET_SIZE(res)) {
3039 if (_PyString_Resize(&res, respos))
3040 goto onError;
3041 }
3042 Py_XDECREF(exc);
3043 Py_XDECREF(errorHandler);
3044 return res;
3045
3046 onError:
3047 Py_XDECREF(res);
3048 Py_XDECREF(exc);
3049 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 return NULL;
3051}
3052
3053PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3054 PyObject *mapping)
3055{
3056 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3057 PyErr_BadArgument();
3058 return NULL;
3059 }
3060 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3061 PyUnicode_GET_SIZE(unicode),
3062 mapping,
3063 NULL);
3064}
3065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066/* create or adjust a UnicodeTranslateError */
3067static void make_translate_exception(PyObject **exceptionObject,
3068 const Py_UNICODE *unicode, int size,
3069 int startpos, int endpos,
3070 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003072 if (*exceptionObject == NULL) {
3073 *exceptionObject = PyUnicodeTranslateError_Create(
3074 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 }
3076 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003077 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3078 goto onError;
3079 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3080 goto onError;
3081 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3082 goto onError;
3083 return;
3084 onError:
3085 Py_DECREF(*exceptionObject);
3086 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 }
3088}
3089
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003090/* raises a UnicodeTranslateError */
3091static void raise_translate_exception(PyObject **exceptionObject,
3092 const Py_UNICODE *unicode, int size,
3093 int startpos, int endpos,
3094 const char *reason)
3095{
3096 make_translate_exception(exceptionObject,
3097 unicode, size, startpos, endpos, reason);
3098 if (*exceptionObject != NULL)
3099 PyCodec_StrictErrors(*exceptionObject);
3100}
3101
3102/* error handling callback helper:
3103 build arguments, call the callback and check the arguments,
3104 put the result into newpos and return the replacement string, which
3105 has to be freed by the caller */
3106static PyObject *unicode_translate_call_errorhandler(const char *errors,
3107 PyObject **errorHandler,
3108 const char *reason,
3109 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3110 int startpos, int endpos,
3111 int *newpos)
3112{
3113 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3114
3115 PyObject *restuple;
3116 PyObject *resunicode;
3117
3118 if (*errorHandler == NULL) {
3119 *errorHandler = PyCodec_LookupError(errors);
3120 if (*errorHandler == NULL)
3121 return NULL;
3122 }
3123
3124 make_translate_exception(exceptionObject,
3125 unicode, size, startpos, endpos, reason);
3126 if (*exceptionObject == NULL)
3127 return NULL;
3128
3129 restuple = PyObject_CallFunctionObjArgs(
3130 *errorHandler, *exceptionObject, NULL);
3131 if (restuple == NULL)
3132 return NULL;
3133 if (!PyTuple_Check(restuple)) {
3134 PyErr_Format(PyExc_TypeError, &argparse[4]);
3135 Py_DECREF(restuple);
3136 return NULL;
3137 }
3138 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3139 &resunicode, newpos)) {
3140 Py_DECREF(restuple);
3141 return NULL;
3142 }
3143 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003144 *newpos = size+*newpos;
3145 if (*newpos<0 || *newpos>size) {
3146 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3147 Py_DECREF(restuple);
3148 return NULL;
3149 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003150 Py_INCREF(resunicode);
3151 Py_DECREF(restuple);
3152 return resunicode;
3153}
3154
3155/* Lookup the character ch in the mapping and put the result in result,
3156 which must be decrefed by the caller.
3157 Return 0 on success, -1 on error */
3158static
3159int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3160{
3161 PyObject *w = PyInt_FromLong((long)c);
3162 PyObject *x;
3163
3164 if (w == NULL)
3165 return -1;
3166 x = PyObject_GetItem(mapping, w);
3167 Py_DECREF(w);
3168 if (x == NULL) {
3169 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3170 /* No mapping found means: use 1:1 mapping. */
3171 PyErr_Clear();
3172 *result = NULL;
3173 return 0;
3174 } else
3175 return -1;
3176 }
3177 else if (x == Py_None) {
3178 *result = x;
3179 return 0;
3180 }
3181 else if (PyInt_Check(x)) {
3182 long value = PyInt_AS_LONG(x);
3183 long max = PyUnicode_GetMax();
3184 if (value < 0 || value > max) {
3185 PyErr_Format(PyExc_TypeError,
3186 "character mapping must be in range(0x%lx)", max+1);
3187 Py_DECREF(x);
3188 return -1;
3189 }
3190 *result = x;
3191 return 0;
3192 }
3193 else if (PyUnicode_Check(x)) {
3194 *result = x;
3195 return 0;
3196 }
3197 else {
3198 /* wrong return value */
3199 PyErr_SetString(PyExc_TypeError,
3200 "character mapping must return integer, None or unicode");
3201 return -1;
3202 }
3203}
3204/* ensure that *outobj is at least requiredsize characters long,
3205if not reallocate and adjust various state variables.
3206Return 0 on success, -1 on error */
3207static
3208int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3209 int requiredsize)
3210{
3211 if (requiredsize > *outsize) {
3212 /* remember old output position */
3213 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3214 /* exponentially overallocate to minimize reallocations */
3215 if (requiredsize < 2 * *outsize)
3216 requiredsize = 2 * *outsize;
3217 if (_PyUnicode_Resize(outobj, requiredsize))
3218 return -1;
3219 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3220 *outsize = requiredsize;
3221 }
3222 return 0;
3223}
3224/* lookup the character, put the result in the output string and adjust
3225 various state variables. Return a new reference to the object that
3226 was put in the output buffer in *result, or Py_None, if the mapping was
3227 undefined (in which case no character was written).
3228 The called must decref result.
3229 Return 0 on success, -1 on error. */
3230static
3231int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3232 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3233{
3234 if (charmaptranslate_lookup(c, mapping, res))
3235 return -1;
3236 if (*res==NULL) {
3237 /* not found => default to 1:1 mapping */
3238 *(*outp)++ = (Py_UNICODE)c;
3239 }
3240 else if (*res==Py_None)
3241 ;
3242 else if (PyInt_Check(*res)) {
3243 /* no overflow check, because we know that the space is enough */
3244 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3245 }
3246 else if (PyUnicode_Check(*res)) {
3247 int repsize = PyUnicode_GET_SIZE(*res);
3248 if (repsize==1) {
3249 /* no overflow check, because we know that the space is enough */
3250 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3251 }
3252 else if (repsize!=0) {
3253 /* more than one character */
3254 int requiredsize = *outsize + repsize - 1;
3255 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3256 return -1;
3257 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3258 *outp += repsize;
3259 }
3260 }
3261 else
3262 return -1;
3263 return 0;
3264}
3265
3266PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 int size,
3268 PyObject *mapping,
3269 const char *errors)
3270{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 /* output object */
3272 PyObject *res = NULL;
3273 /* pointers to the beginning and end+1 of input */
3274 const Py_UNICODE *startp = p;
3275 const Py_UNICODE *endp = p + size;
3276 /* pointer into the output */
3277 Py_UNICODE *str;
3278 /* current output position */
3279 int respos = 0;
3280 int ressize;
3281 char *reason = "character maps to <undefined>";
3282 PyObject *errorHandler = NULL;
3283 PyObject *exc = NULL;
3284 /* the following variable is used for caching string comparisons
3285 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3286 * 3=ignore, 4=xmlcharrefreplace */
3287 int known_errorHandler = -1;
3288
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289 if (mapping == NULL) {
3290 PyErr_BadArgument();
3291 return NULL;
3292 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293
3294 /* allocate enough for a simple 1:1 translation without
3295 replacements, if we need more, we'll resize */
3296 res = PyUnicode_FromUnicode(NULL, size);
3297 if (res == NULL)
3298 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 return res;
3301 str = PyUnicode_AS_UNICODE(res);
3302 ressize = size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 while (p<endp) {
3305 /* try to encode it */
3306 PyObject *x = NULL;
3307 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3308 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 goto onError;
3310 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003311 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003312 if (x!=Py_None) /* it worked => adjust input pointer */
3313 ++p;
3314 else { /* untranslatable character */
3315 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3316 int repsize;
3317 int newpos;
3318 Py_UNICODE *uni2;
3319 /* startpos for collecting untranslatable chars */
3320 const Py_UNICODE *collstart = p;
3321 const Py_UNICODE *collend = p+1;
3322 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324 /* find all untranslatable characters */
3325 while (collend < endp) {
3326 if (charmaptranslate_lookup(*collend, mapping, &x))
3327 goto onError;
3328 Py_XDECREF(x);
3329 if (x!=Py_None)
3330 break;
3331 ++collend;
3332 }
3333 /* cache callback name lookup
3334 * (if not done yet, i.e. it's the first error) */
3335 if (known_errorHandler==-1) {
3336 if ((errors==NULL) || (!strcmp(errors, "strict")))
3337 known_errorHandler = 1;
3338 else if (!strcmp(errors, "replace"))
3339 known_errorHandler = 2;
3340 else if (!strcmp(errors, "ignore"))
3341 known_errorHandler = 3;
3342 else if (!strcmp(errors, "xmlcharrefreplace"))
3343 known_errorHandler = 4;
3344 else
3345 known_errorHandler = 0;
3346 }
3347 switch (known_errorHandler) {
3348 case 1: /* strict */
3349 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3350 goto onError;
3351 case 2: /* replace */
3352 /* No need to check for space, this is a 1:1 replacement */
3353 for (coll = collstart; coll<collend; ++coll)
3354 *str++ = '?';
3355 /* fall through */
3356 case 3: /* ignore */
3357 p = collend;
3358 break;
3359 case 4: /* xmlcharrefreplace */
3360 /* generate replacement (temporarily (mis)uses p) */
3361 for (p = collstart; p < collend; ++p) {
3362 char buffer[2+29+1+1];
3363 char *cp;
3364 sprintf(buffer, "&#%d;", (int)*p);
3365 if (charmaptranslate_makespace(&res, &str, &ressize,
3366 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3367 goto onError;
3368 for (cp = buffer; *cp; ++cp)
3369 *str++ = *cp;
3370 }
3371 p = collend;
3372 break;
3373 default:
3374 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3375 reason, startp, size, &exc,
3376 collstart-startp, collend-startp, &newpos);
3377 if (repunicode == NULL)
3378 goto onError;
3379 /* generate replacement */
3380 repsize = PyUnicode_GET_SIZE(repunicode);
3381 if (charmaptranslate_makespace(&res, &str, &ressize,
3382 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3383 Py_DECREF(repunicode);
3384 goto onError;
3385 }
3386 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3387 *str++ = *uni2;
3388 p = startp + newpos;
3389 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390 }
3391 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003393 /* Resize if we allocated to much */
3394 respos = str-PyUnicode_AS_UNICODE(res);
3395 if (respos<ressize) {
3396 if (_PyUnicode_Resize(&res, respos))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003397 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 }
3399 Py_XDECREF(exc);
3400 Py_XDECREF(errorHandler);
3401 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003402
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 onError:
3404 Py_XDECREF(res);
3405 Py_XDECREF(exc);
3406 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 return NULL;
3408}
3409
3410PyObject *PyUnicode_Translate(PyObject *str,
3411 PyObject *mapping,
3412 const char *errors)
3413{
3414 PyObject *result;
3415
3416 str = PyUnicode_FromObject(str);
3417 if (str == NULL)
3418 goto onError;
3419 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3420 PyUnicode_GET_SIZE(str),
3421 mapping,
3422 errors);
3423 Py_DECREF(str);
3424 return result;
3425
3426 onError:
3427 Py_XDECREF(str);
3428 return NULL;
3429}
3430
Guido van Rossum9e896b32000-04-05 20:11:21 +00003431/* --- Decimal Encoder ---------------------------------------------------- */
3432
3433int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3434 int length,
3435 char *output,
3436 const char *errors)
3437{
3438 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439 PyObject *errorHandler = NULL;
3440 PyObject *exc = NULL;
3441 const char *encoding = "decimal";
3442 const char *reason = "invalid decimal Unicode string";
3443 /* the following variable is used for caching string comparisons
3444 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3445 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003446
3447 if (output == NULL) {
3448 PyErr_BadArgument();
3449 return -1;
3450 }
3451
3452 p = s;
3453 end = s + length;
3454 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003455 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003456 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 PyObject *repunicode;
3458 int repsize;
3459 int newpos;
3460 Py_UNICODE *uni2;
3461 Py_UNICODE *collstart;
3462 Py_UNICODE *collend;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003463
3464 if (Py_UNICODE_ISSPACE(ch)) {
3465 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003467 continue;
3468 }
3469 decimal = Py_UNICODE_TODECIMAL(ch);
3470 if (decimal >= 0) {
3471 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003473 continue;
3474 }
Guido van Rossumba477042000-04-06 18:18:10 +00003475 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003476 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003477 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003478 continue;
3479 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480 /* All other characters are considered unencodable */
3481 collstart = p;
3482 collend = p+1;
3483 while (collend < end) {
3484 if ((0 < *collend && *collend < 256) ||
3485 !Py_UNICODE_ISSPACE(*collend) ||
3486 Py_UNICODE_TODECIMAL(*collend))
3487 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003488 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489 /* cache callback name lookup
3490 * (if not done yet, i.e. it's the first error) */
3491 if (known_errorHandler==-1) {
3492 if ((errors==NULL) || (!strcmp(errors, "strict")))
3493 known_errorHandler = 1;
3494 else if (!strcmp(errors, "replace"))
3495 known_errorHandler = 2;
3496 else if (!strcmp(errors, "ignore"))
3497 known_errorHandler = 3;
3498 else if (!strcmp(errors, "xmlcharrefreplace"))
3499 known_errorHandler = 4;
3500 else
3501 known_errorHandler = 0;
3502 }
3503 switch (known_errorHandler) {
3504 case 1: /* strict */
3505 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3506 goto onError;
3507 case 2: /* replace */
3508 for (p = collstart; p < collend; ++p)
3509 *output++ = '?';
3510 /* fall through */
3511 case 3: /* ignore */
3512 p = collend;
3513 break;
3514 case 4: /* xmlcharrefreplace */
3515 /* generate replacement (temporarily (mis)uses p) */
3516 for (p = collstart; p < collend; ++p)
3517 output += sprintf(output, "&#%d;", (int)*p);
3518 p = collend;
3519 break;
3520 default:
3521 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3522 encoding, reason, s, length, &exc,
3523 collstart-s, collend-s, &newpos);
3524 if (repunicode == NULL)
3525 goto onError;
3526 /* generate replacement */
3527 repsize = PyUnicode_GET_SIZE(repunicode);
3528 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3529 Py_UNICODE ch = *uni2;
3530 if (Py_UNICODE_ISSPACE(ch))
3531 *output++ = ' ';
3532 else {
3533 decimal = Py_UNICODE_TODECIMAL(ch);
3534 if (decimal >= 0)
3535 *output++ = '0' + decimal;
3536 else if (0 < ch && ch < 256)
3537 *output++ = (char)ch;
3538 else {
3539 Py_DECREF(repunicode);
3540 raise_encode_exception(&exc, encoding,
3541 s, length, collstart-s, collend-s, reason);
3542 goto onError;
3543 }
3544 }
3545 }
3546 p = s + newpos;
3547 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003548 }
3549 }
3550 /* 0-terminate the output string */
3551 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 Py_XDECREF(exc);
3553 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003554 return 0;
3555
3556 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 Py_XDECREF(exc);
3558 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003559 return -1;
3560}
3561
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562/* --- Helpers ------------------------------------------------------------ */
3563
3564static
3565int count(PyUnicodeObject *self,
3566 int start,
3567 int end,
3568 PyUnicodeObject *substring)
3569{
3570 int count = 0;
3571
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003572 if (start < 0)
3573 start += self->length;
3574 if (start < 0)
3575 start = 0;
3576 if (end > self->length)
3577 end = self->length;
3578 if (end < 0)
3579 end += self->length;
3580 if (end < 0)
3581 end = 0;
3582
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003583 if (substring->length == 0)
3584 return (end - start + 1);
3585
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 end -= substring->length;
3587
3588 while (start <= end)
3589 if (Py_UNICODE_MATCH(self, start, substring)) {
3590 count++;
3591 start += substring->length;
3592 } else
3593 start++;
3594
3595 return count;
3596}
3597
3598int PyUnicode_Count(PyObject *str,
3599 PyObject *substr,
3600 int start,
3601 int end)
3602{
3603 int result;
3604
3605 str = PyUnicode_FromObject(str);
3606 if (str == NULL)
3607 return -1;
3608 substr = PyUnicode_FromObject(substr);
3609 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003610 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 return -1;
3612 }
3613
3614 result = count((PyUnicodeObject *)str,
3615 start, end,
3616 (PyUnicodeObject *)substr);
3617
3618 Py_DECREF(str);
3619 Py_DECREF(substr);
3620 return result;
3621}
3622
3623static
3624int findstring(PyUnicodeObject *self,
3625 PyUnicodeObject *substring,
3626 int start,
3627 int end,
3628 int direction)
3629{
3630 if (start < 0)
3631 start += self->length;
3632 if (start < 0)
3633 start = 0;
3634
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635 if (end > self->length)
3636 end = self->length;
3637 if (end < 0)
3638 end += self->length;
3639 if (end < 0)
3640 end = 0;
3641
Guido van Rossum76afbd92002-08-20 17:29:29 +00003642 if (substring->length == 0)
3643 return (direction > 0) ? start : end;
3644
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 end -= substring->length;
3646
3647 if (direction < 0) {
3648 for (; end >= start; end--)
3649 if (Py_UNICODE_MATCH(self, end, substring))
3650 return end;
3651 } else {
3652 for (; start <= end; start++)
3653 if (Py_UNICODE_MATCH(self, start, substring))
3654 return start;
3655 }
3656
3657 return -1;
3658}
3659
3660int PyUnicode_Find(PyObject *str,
3661 PyObject *substr,
3662 int start,
3663 int end,
3664 int direction)
3665{
3666 int result;
3667
3668 str = PyUnicode_FromObject(str);
3669 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003670 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 substr = PyUnicode_FromObject(substr);
3672 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003673 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003674 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003675 }
3676
3677 result = findstring((PyUnicodeObject *)str,
3678 (PyUnicodeObject *)substr,
3679 start, end, direction);
3680 Py_DECREF(str);
3681 Py_DECREF(substr);
3682 return result;
3683}
3684
3685static
3686int tailmatch(PyUnicodeObject *self,
3687 PyUnicodeObject *substring,
3688 int start,
3689 int end,
3690 int direction)
3691{
3692 if (start < 0)
3693 start += self->length;
3694 if (start < 0)
3695 start = 0;
3696
3697 if (substring->length == 0)
3698 return 1;
3699
3700 if (end > self->length)
3701 end = self->length;
3702 if (end < 0)
3703 end += self->length;
3704 if (end < 0)
3705 end = 0;
3706
3707 end -= substring->length;
3708 if (end < start)
3709 return 0;
3710
3711 if (direction > 0) {
3712 if (Py_UNICODE_MATCH(self, end, substring))
3713 return 1;
3714 } else {
3715 if (Py_UNICODE_MATCH(self, start, substring))
3716 return 1;
3717 }
3718
3719 return 0;
3720}
3721
3722int PyUnicode_Tailmatch(PyObject *str,
3723 PyObject *substr,
3724 int start,
3725 int end,
3726 int direction)
3727{
3728 int result;
3729
3730 str = PyUnicode_FromObject(str);
3731 if (str == NULL)
3732 return -1;
3733 substr = PyUnicode_FromObject(substr);
3734 if (substr == NULL) {
3735 Py_DECREF(substr);
3736 return -1;
3737 }
3738
3739 result = tailmatch((PyUnicodeObject *)str,
3740 (PyUnicodeObject *)substr,
3741 start, end, direction);
3742 Py_DECREF(str);
3743 Py_DECREF(substr);
3744 return result;
3745}
3746
3747static
3748const Py_UNICODE *findchar(const Py_UNICODE *s,
3749 int size,
3750 Py_UNICODE ch)
3751{
3752 /* like wcschr, but doesn't stop at NULL characters */
3753
3754 while (size-- > 0) {
3755 if (*s == ch)
3756 return s;
3757 s++;
3758 }
3759
3760 return NULL;
3761}
3762
3763/* Apply fixfct filter to the Unicode object self and return a
3764 reference to the modified object */
3765
3766static
3767PyObject *fixup(PyUnicodeObject *self,
3768 int (*fixfct)(PyUnicodeObject *s))
3769{
3770
3771 PyUnicodeObject *u;
3772
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003773 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774 if (u == NULL)
3775 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003776
3777 Py_UNICODE_COPY(u->str, self->str, self->length);
3778
Tim Peters7a29bd52001-09-12 03:03:31 +00003779 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 /* fixfct should return TRUE if it modified the buffer. If
3781 FALSE, return a reference to the original buffer instead
3782 (to save space, not time) */
3783 Py_INCREF(self);
3784 Py_DECREF(u);
3785 return (PyObject*) self;
3786 }
3787 return (PyObject*) u;
3788}
3789
3790static
3791int fixupper(PyUnicodeObject *self)
3792{
3793 int len = self->length;
3794 Py_UNICODE *s = self->str;
3795 int status = 0;
3796
3797 while (len-- > 0) {
3798 register Py_UNICODE ch;
3799
3800 ch = Py_UNICODE_TOUPPER(*s);
3801 if (ch != *s) {
3802 status = 1;
3803 *s = ch;
3804 }
3805 s++;
3806 }
3807
3808 return status;
3809}
3810
3811static
3812int fixlower(PyUnicodeObject *self)
3813{
3814 int len = self->length;
3815 Py_UNICODE *s = self->str;
3816 int status = 0;
3817
3818 while (len-- > 0) {
3819 register Py_UNICODE ch;
3820
3821 ch = Py_UNICODE_TOLOWER(*s);
3822 if (ch != *s) {
3823 status = 1;
3824 *s = ch;
3825 }
3826 s++;
3827 }
3828
3829 return status;
3830}
3831
3832static
3833int fixswapcase(PyUnicodeObject *self)
3834{
3835 int len = self->length;
3836 Py_UNICODE *s = self->str;
3837 int status = 0;
3838
3839 while (len-- > 0) {
3840 if (Py_UNICODE_ISUPPER(*s)) {
3841 *s = Py_UNICODE_TOLOWER(*s);
3842 status = 1;
3843 } else if (Py_UNICODE_ISLOWER(*s)) {
3844 *s = Py_UNICODE_TOUPPER(*s);
3845 status = 1;
3846 }
3847 s++;
3848 }
3849
3850 return status;
3851}
3852
3853static
3854int fixcapitalize(PyUnicodeObject *self)
3855{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003856 int len = self->length;
3857 Py_UNICODE *s = self->str;
3858 int status = 0;
3859
3860 if (len == 0)
3861 return 0;
3862 if (Py_UNICODE_ISLOWER(*s)) {
3863 *s = Py_UNICODE_TOUPPER(*s);
3864 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003866 s++;
3867 while (--len > 0) {
3868 if (Py_UNICODE_ISUPPER(*s)) {
3869 *s = Py_UNICODE_TOLOWER(*s);
3870 status = 1;
3871 }
3872 s++;
3873 }
3874 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875}
3876
3877static
3878int fixtitle(PyUnicodeObject *self)
3879{
3880 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3881 register Py_UNICODE *e;
3882 int previous_is_cased;
3883
3884 /* Shortcut for single character strings */
3885 if (PyUnicode_GET_SIZE(self) == 1) {
3886 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3887 if (*p != ch) {
3888 *p = ch;
3889 return 1;
3890 }
3891 else
3892 return 0;
3893 }
3894
3895 e = p + PyUnicode_GET_SIZE(self);
3896 previous_is_cased = 0;
3897 for (; p < e; p++) {
3898 register const Py_UNICODE ch = *p;
3899
3900 if (previous_is_cased)
3901 *p = Py_UNICODE_TOLOWER(ch);
3902 else
3903 *p = Py_UNICODE_TOTITLE(ch);
3904
3905 if (Py_UNICODE_ISLOWER(ch) ||
3906 Py_UNICODE_ISUPPER(ch) ||
3907 Py_UNICODE_ISTITLE(ch))
3908 previous_is_cased = 1;
3909 else
3910 previous_is_cased = 0;
3911 }
3912 return 1;
3913}
3914
3915PyObject *PyUnicode_Join(PyObject *separator,
3916 PyObject *seq)
3917{
3918 Py_UNICODE *sep;
3919 int seplen;
3920 PyUnicodeObject *res = NULL;
3921 int reslen = 0;
3922 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 int sz = 100;
3924 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003925 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926
Tim Peters2cfe3682001-05-05 05:36:48 +00003927 it = PyObject_GetIter(seq);
3928 if (it == NULL)
3929 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930
3931 if (separator == NULL) {
3932 Py_UNICODE blank = ' ';
3933 sep = &blank;
3934 seplen = 1;
3935 }
3936 else {
3937 separator = PyUnicode_FromObject(separator);
3938 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 sep = PyUnicode_AS_UNICODE(separator);
3941 seplen = PyUnicode_GET_SIZE(separator);
3942 }
3943
3944 res = _PyUnicode_New(sz);
3945 if (res == NULL)
3946 goto onError;
3947 p = PyUnicode_AS_UNICODE(res);
3948 reslen = 0;
3949
Tim Peters2cfe3682001-05-05 05:36:48 +00003950 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003952 PyObject *item = PyIter_Next(it);
3953 if (item == NULL) {
3954 if (PyErr_Occurred())
3955 goto onError;
3956 break;
3957 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 if (!PyUnicode_Check(item)) {
3959 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003960 if (!PyString_Check(item)) {
3961 PyErr_Format(PyExc_TypeError,
3962 "sequence item %i: expected string or Unicode,"
3963 " %.80s found",
3964 i, item->ob_type->tp_name);
3965 Py_DECREF(item);
3966 goto onError;
3967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 v = PyUnicode_FromObject(item);
3969 Py_DECREF(item);
3970 item = v;
3971 if (item == NULL)
3972 goto onError;
3973 }
3974 itemlen = PyUnicode_GET_SIZE(item);
3975 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003976 if (_PyUnicode_Resize(&res, sz*2)) {
3977 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 sz *= 2;
3981 p = PyUnicode_AS_UNICODE(res) + reslen;
3982 }
3983 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003984 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 p += seplen;
3986 reslen += seplen;
3987 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003988 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 p += itemlen;
3990 reslen += itemlen;
3991 Py_DECREF(item);
3992 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003993 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994 goto onError;
3995
3996 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003997 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 return (PyObject *)res;
3999
4000 onError:
4001 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004002 Py_XDECREF(res);
4003 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 return NULL;
4005}
4006
4007static
4008PyUnicodeObject *pad(PyUnicodeObject *self,
4009 int left,
4010 int right,
4011 Py_UNICODE fill)
4012{
4013 PyUnicodeObject *u;
4014
4015 if (left < 0)
4016 left = 0;
4017 if (right < 0)
4018 right = 0;
4019
Tim Peters7a29bd52001-09-12 03:03:31 +00004020 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 Py_INCREF(self);
4022 return self;
4023 }
4024
4025 u = _PyUnicode_New(left + self->length + right);
4026 if (u) {
4027 if (left)
4028 Py_UNICODE_FILL(u->str, fill, left);
4029 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4030 if (right)
4031 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4032 }
4033
4034 return u;
4035}
4036
/* Append the slice data[left:right] as a new Unicode object to the
   list being built.

   Relies on the enclosing function providing a PyObject *str scratch
   variable, a PyObject *list and an onError label, which is jumped to
   on failure.  Expands to a single statement, so it is safe in
   unbraced if/else bodies; the invocation must be followed by ';'. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
4047
4048static
4049PyObject *split_whitespace(PyUnicodeObject *self,
4050 PyObject *list,
4051 int maxcount)
4052{
4053 register int i;
4054 register int j;
4055 int len = self->length;
4056 PyObject *str;
4057
4058 for (i = j = 0; i < len; ) {
4059 /* find a token */
4060 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4061 i++;
4062 j = i;
4063 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4064 i++;
4065 if (j < i) {
4066 if (maxcount-- <= 0)
4067 break;
4068 SPLIT_APPEND(self->str, j, i);
4069 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4070 i++;
4071 j = i;
4072 }
4073 }
4074 if (j < len) {
4075 SPLIT_APPEND(self->str, j, len);
4076 }
4077 return list;
4078
4079 onError:
4080 Py_DECREF(list);
4081 return NULL;
4082}
4083
4084PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004085 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086{
4087 register int i;
4088 register int j;
4089 int len;
4090 PyObject *list;
4091 PyObject *str;
4092 Py_UNICODE *data;
4093
4094 string = PyUnicode_FromObject(string);
4095 if (string == NULL)
4096 return NULL;
4097 data = PyUnicode_AS_UNICODE(string);
4098 len = PyUnicode_GET_SIZE(string);
4099
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100 list = PyList_New(0);
4101 if (!list)
4102 goto onError;
4103
4104 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004105 int eol;
4106
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 /* Find a line and append it */
4108 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4109 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110
4111 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004112 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 if (i < len) {
4114 if (data[i] == '\r' && i + 1 < len &&
4115 data[i+1] == '\n')
4116 i += 2;
4117 else
4118 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004119 if (keepends)
4120 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 }
Guido van Rossum86662912000-04-11 15:38:46 +00004122 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 j = i;
4124 }
4125 if (j < len) {
4126 SPLIT_APPEND(data, j, len);
4127 }
4128
4129 Py_DECREF(string);
4130 return list;
4131
4132 onError:
4133 Py_DECREF(list);
4134 Py_DECREF(string);
4135 return NULL;
4136}
4137
4138static
4139PyObject *split_char(PyUnicodeObject *self,
4140 PyObject *list,
4141 Py_UNICODE ch,
4142 int maxcount)
4143{
4144 register int i;
4145 register int j;
4146 int len = self->length;
4147 PyObject *str;
4148
4149 for (i = j = 0; i < len; ) {
4150 if (self->str[i] == ch) {
4151 if (maxcount-- <= 0)
4152 break;
4153 SPLIT_APPEND(self->str, j, i);
4154 i = j = i + 1;
4155 } else
4156 i++;
4157 }
4158 if (j <= len) {
4159 SPLIT_APPEND(self->str, j, len);
4160 }
4161 return list;
4162
4163 onError:
4164 Py_DECREF(list);
4165 return NULL;
4166}
4167
4168static
4169PyObject *split_substring(PyUnicodeObject *self,
4170 PyObject *list,
4171 PyUnicodeObject *substring,
4172 int maxcount)
4173{
4174 register int i;
4175 register int j;
4176 int len = self->length;
4177 int sublen = substring->length;
4178 PyObject *str;
4179
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004180 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181 if (Py_UNICODE_MATCH(self, i, substring)) {
4182 if (maxcount-- <= 0)
4183 break;
4184 SPLIT_APPEND(self->str, j, i);
4185 i = j = i + sublen;
4186 } else
4187 i++;
4188 }
4189 if (j <= len) {
4190 SPLIT_APPEND(self->str, j, len);
4191 }
4192 return list;
4193
4194 onError:
4195 Py_DECREF(list);
4196 return NULL;
4197}
4198
4199#undef SPLIT_APPEND
4200
4201static
4202PyObject *split(PyUnicodeObject *self,
4203 PyUnicodeObject *substring,
4204 int maxcount)
4205{
4206 PyObject *list;
4207
4208 if (maxcount < 0)
4209 maxcount = INT_MAX;
4210
4211 list = PyList_New(0);
4212 if (!list)
4213 return NULL;
4214
4215 if (substring == NULL)
4216 return split_whitespace(self,list,maxcount);
4217
4218 else if (substring->length == 1)
4219 return split_char(self,list,substring->str[0],maxcount);
4220
4221 else if (substring->length == 0) {
4222 Py_DECREF(list);
4223 PyErr_SetString(PyExc_ValueError, "empty separator");
4224 return NULL;
4225 }
4226 else
4227 return split_substring(self,list,substring,maxcount);
4228}
4229
4230static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231PyObject *replace(PyUnicodeObject *self,
4232 PyUnicodeObject *str1,
4233 PyUnicodeObject *str2,
4234 int maxcount)
4235{
4236 PyUnicodeObject *u;
4237
4238 if (maxcount < 0)
4239 maxcount = INT_MAX;
4240
4241 if (str1->length == 1 && str2->length == 1) {
4242 int i;
4243
4244 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004245 if (!findchar(self->str, self->length, str1->str[0]) &&
4246 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247 /* nothing to replace, return original string */
4248 Py_INCREF(self);
4249 u = self;
4250 } else {
4251 Py_UNICODE u1 = str1->str[0];
4252 Py_UNICODE u2 = str2->str[0];
4253
4254 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004255 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256 self->length
4257 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004258 if (u != NULL) {
4259 Py_UNICODE_COPY(u->str, self->str,
4260 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261 for (i = 0; i < u->length; i++)
4262 if (u->str[i] == u1) {
4263 if (--maxcount < 0)
4264 break;
4265 u->str[i] = u2;
4266 }
4267 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004269
4270 } else {
4271 int n, i;
4272 Py_UNICODE *p;
4273
4274 /* replace strings */
4275 n = count(self, 0, self->length, str1);
4276 if (n > maxcount)
4277 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004278 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004280 if (PyUnicode_CheckExact(self)) {
4281 Py_INCREF(self);
4282 u = self;
4283 }
4284 else {
4285 u = (PyUnicodeObject *)
4286 PyUnicode_FromUnicode(self->str, self->length);
4287 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288 } else {
4289 u = _PyUnicode_New(
4290 self->length + n * (str2->length - str1->length));
4291 if (u) {
4292 i = 0;
4293 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004294 if (str1->length > 0) {
4295 while (i <= self->length - str1->length)
4296 if (Py_UNICODE_MATCH(self, i, str1)) {
4297 /* replace string segment */
4298 Py_UNICODE_COPY(p, str2->str, str2->length);
4299 p += str2->length;
4300 i += str1->length;
4301 if (--n <= 0) {
4302 /* copy remaining part */
4303 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4304 break;
4305 }
4306 } else
4307 *p++ = self->str[i++];
4308 } else {
4309 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310 Py_UNICODE_COPY(p, str2->str, str2->length);
4311 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004312 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004315 }
4316 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318 }
4319 }
4320 }
4321
4322 return (PyObject *) u;
4323}
4324
4325/* --- Unicode Object Methods --------------------------------------------- */
4326
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004327PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328"S.title() -> unicode\n\
4329\n\
4330Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004331characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332
4333static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004334unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 return fixup(self, fixtitle);
4337}
4338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004339PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340"S.capitalize() -> unicode\n\
4341\n\
4342Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004343have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344
4345static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004346unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348 return fixup(self, fixcapitalize);
4349}
4350
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method implementation: split self at whitespace, capitalize
   every word and join the words again with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    int k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                     fixcapitalize);
        if (item == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* swap the capitalized word in for the original one */
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
4388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004389PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390"S.center(width) -> unicode\n\
4391\n\
4392Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004393using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394
4395static PyObject *
4396unicode_center(PyUnicodeObject *self, PyObject *args)
4397{
4398 int marg, left;
4399 int width;
4400
4401 if (!PyArg_ParseTuple(args, "i:center", &width))
4402 return NULL;
4403
Tim Peters7a29bd52001-09-12 03:03:31 +00004404 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 Py_INCREF(self);
4406 return (PyObject*) self;
4407 }
4408
4409 marg = width - self->length;
4410 left = marg / 2 + (marg & width & 1);
4411
4412 return (PyObject*) pad(self, left, marg - left, ' ');
4413}
4414
Marc-André Lemburge5034372000-08-08 08:04:29 +00004415#if 0
4416
4417/* This code should go into some future Unicode collation support
4418 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004419 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004420
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004421/* speedy UTF-16 code point order comparison */
4422/* gleaned from: */
4423/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4424
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004425static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004426{
4427 0, 0, 0, 0, 0, 0, 0, 0,
4428 0, 0, 0, 0, 0, 0, 0, 0,
4429 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004430 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004431};
4432
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433static int
4434unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4435{
4436 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004437
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438 Py_UNICODE *s1 = str1->str;
4439 Py_UNICODE *s2 = str2->str;
4440
4441 len1 = str1->length;
4442 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004443
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004445 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004446
4447 c1 = *s1++;
4448 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004449
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004450 if (c1 > (1<<11) * 26)
4451 c1 += utf16Fixup[c1>>11];
4452 if (c2 > (1<<11) * 26)
4453 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004454 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004455
4456 if (c1 != c2)
4457 return (c1 < c2) ? -1 : 1;
4458
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004459 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 }
4461
4462 return (len1 < len2) ? -1 : (len1 != len2);
4463}
4464
Marc-André Lemburge5034372000-08-08 08:04:29 +00004465#else
4466
4467static int
4468unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4469{
4470 register int len1, len2;
4471
4472 Py_UNICODE *s1 = str1->str;
4473 Py_UNICODE *s2 = str2->str;
4474
4475 len1 = str1->length;
4476 len2 = str2->length;
4477
4478 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00004479 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004480
Fredrik Lundh45714e92001-06-26 16:39:36 +00004481 c1 = *s1++;
4482 c2 = *s2++;
4483
4484 if (c1 != c2)
4485 return (c1 < c2) ? -1 : 1;
4486
Marc-André Lemburge5034372000-08-08 08:04:29 +00004487 len1--; len2--;
4488 }
4489
4490 return (len1 < len2) ? -1 : (len1 != len2);
4491}
4492
4493#endif
4494
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495int PyUnicode_Compare(PyObject *left,
4496 PyObject *right)
4497{
4498 PyUnicodeObject *u = NULL, *v = NULL;
4499 int result;
4500
4501 /* Coerce the two arguments */
4502 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4503 if (u == NULL)
4504 goto onError;
4505 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4506 if (v == NULL)
4507 goto onError;
4508
Thomas Wouters7e474022000-07-16 12:04:32 +00004509 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 if (v == u) {
4511 Py_DECREF(u);
4512 Py_DECREF(v);
4513 return 0;
4514 }
4515
4516 result = unicode_compare(u, v);
4517
4518 Py_DECREF(u);
4519 Py_DECREF(v);
4520 return result;
4521
4522onError:
4523 Py_XDECREF(u);
4524 Py_XDECREF(v);
4525 return -1;
4526}
4527
Guido van Rossum403d68b2000-03-13 15:55:09 +00004528int PyUnicode_Contains(PyObject *container,
4529 PyObject *element)
4530{
4531 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004532 int result, size;
4533 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004534
4535 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004536 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004537 if (v == NULL) {
4538 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004539 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004540 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004541 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004542 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004543 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004544 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004545
Barry Warsaw817918c2002-08-06 16:58:21 +00004546 size = PyUnicode_GET_SIZE(v);
4547 rhs = PyUnicode_AS_UNICODE(v);
4548 lhs = PyUnicode_AS_UNICODE(u);
4549
Guido van Rossum403d68b2000-03-13 15:55:09 +00004550 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004551 if (size == 1) {
4552 end = lhs + PyUnicode_GET_SIZE(u);
4553 while (lhs < end) {
4554 if (*lhs++ == *rhs) {
4555 result = 1;
4556 break;
4557 }
4558 }
4559 }
4560 else {
4561 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4562 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004563 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004564 result = 1;
4565 break;
4566 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004567 }
4568 }
4569
4570 Py_DECREF(u);
4571 Py_DECREF(v);
4572 return result;
4573
4574onError:
4575 Py_XDECREF(u);
4576 Py_XDECREF(v);
4577 return -1;
4578}
4579
Guido van Rossumd57fd912000-03-10 22:53:23 +00004580/* Concat to string or Unicode object giving a new Unicode object. */
4581
4582PyObject *PyUnicode_Concat(PyObject *left,
4583 PyObject *right)
4584{
4585 PyUnicodeObject *u = NULL, *v = NULL, *w;
4586
4587 /* Coerce the two arguments */
4588 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4589 if (u == NULL)
4590 goto onError;
4591 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4592 if (v == NULL)
4593 goto onError;
4594
4595 /* Shortcuts */
4596 if (v == unicode_empty) {
4597 Py_DECREF(v);
4598 return (PyObject *)u;
4599 }
4600 if (u == unicode_empty) {
4601 Py_DECREF(u);
4602 return (PyObject *)v;
4603 }
4604
4605 /* Concat the two Unicode strings */
4606 w = _PyUnicode_New(u->length + v->length);
4607 if (w == NULL)
4608 goto onError;
4609 Py_UNICODE_COPY(w->str, u->str, u->length);
4610 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4611
4612 Py_DECREF(u);
4613 Py_DECREF(v);
4614 return (PyObject *)w;
4615
4616onError:
4617 Py_XDECREF(u);
4618 Py_XDECREF(v);
4619 return NULL;
4620}
4621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004622PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623"S.count(sub[, start[, end]]) -> int\n\
4624\n\
4625Return the number of occurrences of substring sub in Unicode string\n\
4626S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004627interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628
4629static PyObject *
4630unicode_count(PyUnicodeObject *self, PyObject *args)
4631{
4632 PyUnicodeObject *substring;
4633 int start = 0;
4634 int end = INT_MAX;
4635 PyObject *result;
4636
Guido van Rossumb8872e62000-05-09 14:14:27 +00004637 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4638 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004639 return NULL;
4640
4641 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4642 (PyObject *)substring);
4643 if (substring == NULL)
4644 return NULL;
4645
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 if (start < 0)
4647 start += self->length;
4648 if (start < 0)
4649 start = 0;
4650 if (end > self->length)
4651 end = self->length;
4652 if (end < 0)
4653 end += self->length;
4654 if (end < 0)
4655 end = 0;
4656
4657 result = PyInt_FromLong((long) count(self, start, end, substring));
4658
4659 Py_DECREF(substring);
4660 return result;
4661}
4662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004663PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664"S.encode([encoding[,errors]]) -> string\n\
4665\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004666Return an encoded string version of S. Default encoding is the current\n\
4667default string encoding. errors may be given to set a different error\n\
4668handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004669a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4670'xmlcharrefreplace' as well as any other name registered with\n\
4671codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672
4673static PyObject *
4674unicode_encode(PyUnicodeObject *self, PyObject *args)
4675{
4676 char *encoding = NULL;
4677 char *errors = NULL;
4678 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4679 return NULL;
4680 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4681}
4682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004683PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684"S.expandtabs([tabsize]) -> unicode\n\
4685\n\
4686Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004687If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688
4689static PyObject*
4690unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4691{
4692 Py_UNICODE *e;
4693 Py_UNICODE *p;
4694 Py_UNICODE *q;
4695 int i, j;
4696 PyUnicodeObject *u;
4697 int tabsize = 8;
4698
4699 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4700 return NULL;
4701
Thomas Wouters7e474022000-07-16 12:04:32 +00004702 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703 i = j = 0;
4704 e = self->str + self->length;
4705 for (p = self->str; p < e; p++)
4706 if (*p == '\t') {
4707 if (tabsize > 0)
4708 j += tabsize - (j % tabsize);
4709 }
4710 else {
4711 j++;
4712 if (*p == '\n' || *p == '\r') {
4713 i += j;
4714 j = 0;
4715 }
4716 }
4717
4718 /* Second pass: create output string and fill it */
4719 u = _PyUnicode_New(i + j);
4720 if (!u)
4721 return NULL;
4722
4723 j = 0;
4724 q = u->str;
4725
4726 for (p = self->str; p < e; p++)
4727 if (*p == '\t') {
4728 if (tabsize > 0) {
4729 i = tabsize - (j % tabsize);
4730 j += i;
4731 while (i--)
4732 *q++ = ' ';
4733 }
4734 }
4735 else {
4736 j++;
4737 *q++ = *p;
4738 if (*p == '\n' || *p == '\r')
4739 j = 0;
4740 }
4741
4742 return (PyObject*) u;
4743}
4744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004745PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746"S.find(sub [,start [,end]]) -> int\n\
4747\n\
4748Return the lowest index in S where substring sub is found,\n\
4749such that sub is contained within s[start,end]. Optional\n\
4750arguments start and end are interpreted as in slice notation.\n\
4751\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004752Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753
4754static PyObject *
4755unicode_find(PyUnicodeObject *self, PyObject *args)
4756{
4757 PyUnicodeObject *substring;
4758 int start = 0;
4759 int end = INT_MAX;
4760 PyObject *result;
4761
Guido van Rossumb8872e62000-05-09 14:14:27 +00004762 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4763 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 return NULL;
4765 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4766 (PyObject *)substring);
4767 if (substring == NULL)
4768 return NULL;
4769
4770 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4771
4772 Py_DECREF(substring);
4773 return result;
4774}
4775
4776static PyObject *
4777unicode_getitem(PyUnicodeObject *self, int index)
4778{
4779 if (index < 0 || index >= self->length) {
4780 PyErr_SetString(PyExc_IndexError, "string index out of range");
4781 return NULL;
4782 }
4783
4784 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4785}
4786
4787static long
4788unicode_hash(PyUnicodeObject *self)
4789{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004790 /* Since Unicode objects compare equal to their ASCII string
4791 counterparts, they should use the individual character values
4792 as basis for their hash value. This is needed to assure that
4793 strings and Unicode objects behave in the same way as
4794 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795
Fredrik Lundhdde61642000-07-10 18:27:47 +00004796 register int len;
4797 register Py_UNICODE *p;
4798 register long x;
4799
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 if (self->hash != -1)
4801 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004802 len = PyUnicode_GET_SIZE(self);
4803 p = PyUnicode_AS_UNICODE(self);
4804 x = *p << 7;
4805 while (--len >= 0)
4806 x = (1000003*x) ^ *p++;
4807 x ^= PyUnicode_GET_SIZE(self);
4808 if (x == -1)
4809 x = -2;
4810 self->hash = x;
4811 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812}
4813
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004814PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815"S.index(sub [,start [,end]]) -> int\n\
4816\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004817Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818
4819static PyObject *
4820unicode_index(PyUnicodeObject *self, PyObject *args)
4821{
4822 int result;
4823 PyUnicodeObject *substring;
4824 int start = 0;
4825 int end = INT_MAX;
4826
Guido van Rossumb8872e62000-05-09 14:14:27 +00004827 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4828 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 return NULL;
4830
4831 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4832 (PyObject *)substring);
4833 if (substring == NULL)
4834 return NULL;
4835
4836 result = findstring(self, substring, start, end, 1);
4837
4838 Py_DECREF(substring);
4839 if (result < 0) {
4840 PyErr_SetString(PyExc_ValueError, "substring not found");
4841 return NULL;
4842 }
4843 return PyInt_FromLong(result);
4844}
4845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004846PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004847"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004849Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004850at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851
4852static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004853unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854{
4855 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4856 register const Py_UNICODE *e;
4857 int cased;
4858
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 /* Shortcut for single character strings */
4860 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004861 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004863 /* Special case for empty strings */
4864 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004865 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004866
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867 e = p + PyUnicode_GET_SIZE(self);
4868 cased = 0;
4869 for (; p < e; p++) {
4870 register const Py_UNICODE ch = *p;
4871
4872 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004873 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 else if (!cased && Py_UNICODE_ISLOWER(ch))
4875 cased = 1;
4876 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004877 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878}
4879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004880PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004881"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004883Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004884at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885
4886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004887unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888{
4889 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4890 register const Py_UNICODE *e;
4891 int cased;
4892
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 /* Shortcut for single character strings */
4894 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004895 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004897 /* Special case for empty strings */
4898 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004899 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004900
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 e = p + PyUnicode_GET_SIZE(self);
4902 cased = 0;
4903 for (; p < e; p++) {
4904 register const Py_UNICODE ch = *p;
4905
4906 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004907 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908 else if (!cased && Py_UNICODE_ISUPPER(ch))
4909 cased = 1;
4910 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004911 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912}
4913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004914PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004915"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004917Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4918characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004919only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920
4921static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004922unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923{
4924 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4925 register const Py_UNICODE *e;
4926 int cased, previous_is_cased;
4927
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928 /* Shortcut for single character strings */
4929 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004930 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4931 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004933 /* Special case for empty strings */
4934 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004935 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004936
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 e = p + PyUnicode_GET_SIZE(self);
4938 cased = 0;
4939 previous_is_cased = 0;
4940 for (; p < e; p++) {
4941 register const Py_UNICODE ch = *p;
4942
4943 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4944 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004945 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946 previous_is_cased = 1;
4947 cased = 1;
4948 }
4949 else if (Py_UNICODE_ISLOWER(ch)) {
4950 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004951 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 previous_is_cased = 1;
4953 cased = 1;
4954 }
4955 else
4956 previous_is_cased = 0;
4957 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004958 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959}
4960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004961PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004962"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004964Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004965False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966
4967static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004968unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969{
4970 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4971 register const Py_UNICODE *e;
4972
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973 /* Shortcut for single character strings */
4974 if (PyUnicode_GET_SIZE(self) == 1 &&
4975 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004976 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004978 /* Special case for empty strings */
4979 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004980 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004981
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982 e = p + PyUnicode_GET_SIZE(self);
4983 for (; p < e; p++) {
4984 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004985 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004987 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988}
4989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004990PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004991"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004992\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004993Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004994and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004995
4996static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004997unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004998{
4999 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5000 register const Py_UNICODE *e;
5001
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005002 /* Shortcut for single character strings */
5003 if (PyUnicode_GET_SIZE(self) == 1 &&
5004 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005005 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005006
5007 /* Special case for empty strings */
5008 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005009 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005010
5011 e = p + PyUnicode_GET_SIZE(self);
5012 for (; p < e; p++) {
5013 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005014 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005015 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005016 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005017}
5018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005019PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005020"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005021\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005022Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005023and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005024
5025static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005026unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005027{
5028 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5029 register const Py_UNICODE *e;
5030
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005031 /* Shortcut for single character strings */
5032 if (PyUnicode_GET_SIZE(self) == 1 &&
5033 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005034 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005035
5036 /* Special case for empty strings */
5037 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005038 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005039
5040 e = p + PyUnicode_GET_SIZE(self);
5041 for (; p < e; p++) {
5042 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005043 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005044 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005045 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005046}
5047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005048PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005049"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005051Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005052False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053
5054static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005055unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056{
5057 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5058 register const Py_UNICODE *e;
5059
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060 /* Shortcut for single character strings */
5061 if (PyUnicode_GET_SIZE(self) == 1 &&
5062 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005063 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005065 /* Special case for empty strings */
5066 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005067 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005068
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069 e = p + PyUnicode_GET_SIZE(self);
5070 for (; p < e; p++) {
5071 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005072 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005074 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075}
5076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005077PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005078"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005080Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005081False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082
5083static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005084unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085{
5086 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5087 register const Py_UNICODE *e;
5088
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 /* Shortcut for single character strings */
5090 if (PyUnicode_GET_SIZE(self) == 1 &&
5091 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005092 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005094 /* Special case for empty strings */
5095 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005096 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005097
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 e = p + PyUnicode_GET_SIZE(self);
5099 for (; p < e; p++) {
5100 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005101 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005103 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104}
5105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005106PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005107"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005109Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005110False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111
5112static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005113unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114{
5115 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5116 register const Py_UNICODE *e;
5117
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 /* Shortcut for single character strings */
5119 if (PyUnicode_GET_SIZE(self) == 1 &&
5120 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005121 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005123 /* Special case for empty strings */
5124 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005125 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005126
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 e = p + PyUnicode_GET_SIZE(self);
5128 for (; p < e; p++) {
5129 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005130 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005132 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133}
5134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005135PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136"S.join(sequence) -> unicode\n\
5137\n\
5138Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005139sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140
5141static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005142unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005144 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145}
5146
5147static int
5148unicode_length(PyUnicodeObject *self)
5149{
5150 return self->length;
5151}
5152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005153PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154"S.ljust(width) -> unicode\n\
5155\n\
5156Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005157done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158
5159static PyObject *
5160unicode_ljust(PyUnicodeObject *self, PyObject *args)
5161{
5162 int width;
5163 if (!PyArg_ParseTuple(args, "i:ljust", &width))
5164 return NULL;
5165
Tim Peters7a29bd52001-09-12 03:03:31 +00005166 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 Py_INCREF(self);
5168 return (PyObject*) self;
5169 }
5170
5171 return (PyObject*) pad(self, 0, width - self->length, ' ');
5172}
5173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005174PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175"S.lower() -> unicode\n\
5176\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005177Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178
5179static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005180unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 return fixup(self, fixlower);
5183}
5184
/* Selectors for which end(s) of a string a strip operation trims. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
5193
5194static const Py_UNICODE *
5195unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5196{
Tim Peters030a5ce2002-04-22 19:00:10 +00005197 size_t i;
5198 for (i = 0; i < n; ++i)
5199 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005200 return s+i;
5201 return NULL;
5202}
5203
5204/* externally visible for str.strip(unicode) */
5205PyObject *
5206_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5207{
5208 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5209 int len = PyUnicode_GET_SIZE(self);
5210 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5211 int seplen = PyUnicode_GET_SIZE(sepobj);
5212 int i, j;
5213
5214 i = 0;
5215 if (striptype != RIGHTSTRIP) {
5216 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5217 i++;
5218 }
5219 }
5220
5221 j = len;
5222 if (striptype != LEFTSTRIP) {
5223 do {
5224 j--;
5225 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5226 j++;
5227 }
5228
5229 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5230 Py_INCREF(self);
5231 return (PyObject*)self;
5232 }
5233 else
5234 return PyUnicode_FromUnicode(s+i, j-i);
5235}
5236
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237
5238static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005239do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005241 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5242 int len = PyUnicode_GET_SIZE(self), i, j;
5243
5244 i = 0;
5245 if (striptype != RIGHTSTRIP) {
5246 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5247 i++;
5248 }
5249 }
5250
5251 j = len;
5252 if (striptype != LEFTSTRIP) {
5253 do {
5254 j--;
5255 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5256 j++;
5257 }
5258
5259 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5260 Py_INCREF(self);
5261 return (PyObject*)self;
5262 }
5263 else
5264 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265}
5266
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005267
5268static PyObject *
5269do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5270{
5271 PyObject *sep = NULL;
5272
5273 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5274 return NULL;
5275
5276 if (sep != NULL && sep != Py_None) {
5277 if (PyUnicode_Check(sep))
5278 return _PyUnicode_XStrip(self, striptype, sep);
5279 else if (PyString_Check(sep)) {
5280 PyObject *res;
5281 sep = PyUnicode_FromObject(sep);
5282 if (sep==NULL)
5283 return NULL;
5284 res = _PyUnicode_XStrip(self, striptype, sep);
5285 Py_DECREF(sep);
5286 return res;
5287 }
5288 else {
5289 PyErr_Format(PyExc_TypeError,
5290 "%s arg must be None, unicode or str",
5291 STRIPNAME(striptype));
5292 return NULL;
5293 }
5294 }
5295
5296 return do_strip(self, striptype);
5297}
5298
5299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005300PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005301"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005302\n\
5303Return a copy of the string S with leading and trailing\n\
5304whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005305If chars is given and not None, remove characters in chars instead.\n\
5306If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005307
5308static PyObject *
5309unicode_strip(PyUnicodeObject *self, PyObject *args)
5310{
5311 if (PyTuple_GET_SIZE(args) == 0)
5312 return do_strip(self, BOTHSTRIP); /* Common case */
5313 else
5314 return do_argstrip(self, BOTHSTRIP, args);
5315}
5316
5317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005318PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005319"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005320\n\
5321Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005322If chars is given and not None, remove characters in chars instead.\n\
5323If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005324
5325static PyObject *
5326unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5327{
5328 if (PyTuple_GET_SIZE(args) == 0)
5329 return do_strip(self, LEFTSTRIP); /* Common case */
5330 else
5331 return do_argstrip(self, LEFTSTRIP, args);
5332}
5333
5334
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005335PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005336"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005337\n\
5338Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005339If chars is given and not None, remove characters in chars instead.\n\
5340If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005341
5342static PyObject *
5343unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5344{
5345 if (PyTuple_GET_SIZE(args) == 0)
5346 return do_strip(self, RIGHTSTRIP); /* Common case */
5347 else
5348 return do_argstrip(self, RIGHTSTRIP, args);
5349}
5350
5351
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352static PyObject*
5353unicode_repeat(PyUnicodeObject *str, int len)
5354{
5355 PyUnicodeObject *u;
5356 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005357 int nchars;
5358 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359
5360 if (len < 0)
5361 len = 0;
5362
Tim Peters7a29bd52001-09-12 03:03:31 +00005363 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364 /* no repeat, return original string */
5365 Py_INCREF(str);
5366 return (PyObject*) str;
5367 }
Tim Peters8f422462000-09-09 06:13:41 +00005368
5369 /* ensure # of chars needed doesn't overflow int and # of bytes
5370 * needed doesn't overflow size_t
5371 */
5372 nchars = len * str->length;
5373 if (len && nchars / len != str->length) {
5374 PyErr_SetString(PyExc_OverflowError,
5375 "repeated string is too long");
5376 return NULL;
5377 }
5378 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5379 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5380 PyErr_SetString(PyExc_OverflowError,
5381 "repeated string is too long");
5382 return NULL;
5383 }
5384 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 if (!u)
5386 return NULL;
5387
5388 p = u->str;
5389
5390 while (len-- > 0) {
5391 Py_UNICODE_COPY(p, str->str, str->length);
5392 p += str->length;
5393 }
5394
5395 return (PyObject*) u;
5396}
5397
5398PyObject *PyUnicode_Replace(PyObject *obj,
5399 PyObject *subobj,
5400 PyObject *replobj,
5401 int maxcount)
5402{
5403 PyObject *self;
5404 PyObject *str1;
5405 PyObject *str2;
5406 PyObject *result;
5407
5408 self = PyUnicode_FromObject(obj);
5409 if (self == NULL)
5410 return NULL;
5411 str1 = PyUnicode_FromObject(subobj);
5412 if (str1 == NULL) {
5413 Py_DECREF(self);
5414 return NULL;
5415 }
5416 str2 = PyUnicode_FromObject(replobj);
5417 if (str2 == NULL) {
5418 Py_DECREF(self);
5419 Py_DECREF(str1);
5420 return NULL;
5421 }
5422 result = replace((PyUnicodeObject *)self,
5423 (PyUnicodeObject *)str1,
5424 (PyUnicodeObject *)str2,
5425 maxcount);
5426 Py_DECREF(self);
5427 Py_DECREF(str1);
5428 Py_DECREF(str2);
5429 return result;
5430}
5431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005432PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433"S.replace (old, new[, maxsplit]) -> unicode\n\
5434\n\
5435Return a copy of S with all occurrences of substring\n\
5436old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005437given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438
5439static PyObject*
5440unicode_replace(PyUnicodeObject *self, PyObject *args)
5441{
5442 PyUnicodeObject *str1;
5443 PyUnicodeObject *str2;
5444 int maxcount = -1;
5445 PyObject *result;
5446
5447 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5448 return NULL;
5449 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5450 if (str1 == NULL)
5451 return NULL;
5452 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005453 if (str2 == NULL) {
5454 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005456 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457
5458 result = replace(self, str1, str2, maxcount);
5459
5460 Py_DECREF(str1);
5461 Py_DECREF(str2);
5462 return result;
5463}
5464
5465static
5466PyObject *unicode_repr(PyObject *unicode)
5467{
5468 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5469 PyUnicode_GET_SIZE(unicode),
5470 1);
5471}
5472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005473PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474"S.rfind(sub [,start [,end]]) -> int\n\
5475\n\
5476Return the highest index in S where substring sub is found,\n\
5477such that sub is contained within s[start,end]. Optional\n\
5478arguments start and end are interpreted as in slice notation.\n\
5479\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005480Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481
5482static PyObject *
5483unicode_rfind(PyUnicodeObject *self, PyObject *args)
5484{
5485 PyUnicodeObject *substring;
5486 int start = 0;
5487 int end = INT_MAX;
5488 PyObject *result;
5489
Guido van Rossumb8872e62000-05-09 14:14:27 +00005490 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5491 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 return NULL;
5493 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5494 (PyObject *)substring);
5495 if (substring == NULL)
5496 return NULL;
5497
5498 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5499
5500 Py_DECREF(substring);
5501 return result;
5502}
5503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005504PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505"S.rindex(sub [,start [,end]]) -> int\n\
5506\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005507Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508
5509static PyObject *
5510unicode_rindex(PyUnicodeObject *self, PyObject *args)
5511{
5512 int result;
5513 PyUnicodeObject *substring;
5514 int start = 0;
5515 int end = INT_MAX;
5516
Guido van Rossumb8872e62000-05-09 14:14:27 +00005517 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5518 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 return NULL;
5520 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5521 (PyObject *)substring);
5522 if (substring == NULL)
5523 return NULL;
5524
5525 result = findstring(self, substring, start, end, -1);
5526
5527 Py_DECREF(substring);
5528 if (result < 0) {
5529 PyErr_SetString(PyExc_ValueError, "substring not found");
5530 return NULL;
5531 }
5532 return PyInt_FromLong(result);
5533}
5534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005535PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536"S.rjust(width) -> unicode\n\
5537\n\
5538Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005539done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540
5541static PyObject *
5542unicode_rjust(PyUnicodeObject *self, PyObject *args)
5543{
5544 int width;
5545 if (!PyArg_ParseTuple(args, "i:rjust", &width))
5546 return NULL;
5547
Tim Peters7a29bd52001-09-12 03:03:31 +00005548 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 Py_INCREF(self);
5550 return (PyObject*) self;
5551 }
5552
5553 return (PyObject*) pad(self, width - self->length, 0, ' ');
5554}
5555
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556static PyObject*
5557unicode_slice(PyUnicodeObject *self, int start, int end)
5558{
5559 /* standard clamping */
5560 if (start < 0)
5561 start = 0;
5562 if (end < 0)
5563 end = 0;
5564 if (end > self->length)
5565 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005566 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 /* full slice, return original string */
5568 Py_INCREF(self);
5569 return (PyObject*) self;
5570 }
5571 if (start > end)
5572 start = end;
5573 /* copy slice */
5574 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5575 end - start);
5576}
5577
5578PyObject *PyUnicode_Split(PyObject *s,
5579 PyObject *sep,
5580 int maxsplit)
5581{
5582 PyObject *result;
5583
5584 s = PyUnicode_FromObject(s);
5585 if (s == NULL)
5586 return NULL;
5587 if (sep != NULL) {
5588 sep = PyUnicode_FromObject(sep);
5589 if (sep == NULL) {
5590 Py_DECREF(s);
5591 return NULL;
5592 }
5593 }
5594
5595 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5596
5597 Py_DECREF(s);
5598 Py_XDECREF(sep);
5599 return result;
5600}
5601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005602PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603"S.split([sep [,maxsplit]]) -> list of strings\n\
5604\n\
5605Return a list of the words in S, using sep as the\n\
5606delimiter string. If maxsplit is given, at most maxsplit\n\
5607splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005608is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609
5610static PyObject*
5611unicode_split(PyUnicodeObject *self, PyObject *args)
5612{
5613 PyObject *substring = Py_None;
5614 int maxcount = -1;
5615
5616 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5617 return NULL;
5618
5619 if (substring == Py_None)
5620 return split(self, NULL, maxcount);
5621 else if (PyUnicode_Check(substring))
5622 return split(self, (PyUnicodeObject *)substring, maxcount);
5623 else
5624 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5625}
5626
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005627PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005628"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629\n\
5630Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005631Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005632is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633
5634static PyObject*
5635unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5636{
Guido van Rossum86662912000-04-11 15:38:46 +00005637 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638
Guido van Rossum86662912000-04-11 15:38:46 +00005639 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 return NULL;
5641
Guido van Rossum86662912000-04-11 15:38:46 +00005642 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643}
5644
5645static
5646PyObject *unicode_str(PyUnicodeObject *self)
5647{
Fred Drakee4315f52000-05-09 19:53:39 +00005648 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649}
5650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005651PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652"S.swapcase() -> unicode\n\
5653\n\
5654Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005655and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656
5657static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005658unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 return fixup(self, fixswapcase);
5661}
5662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005663PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664"S.translate(table) -> unicode\n\
5665\n\
5666Return a copy of the string S, where all characters have been mapped\n\
5667through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005668Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5669Unmapped characters are left untouched. Characters mapped to None\n\
5670are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671
5672static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005673unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 return PyUnicode_TranslateCharmap(self->str,
5676 self->length,
5677 table,
5678 "ignore");
5679}
5680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005681PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682"S.upper() -> unicode\n\
5683\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005684Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685
5686static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005687unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 return fixup(self, fixupper);
5690}
5691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005692PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693"S.zfill(width) -> unicode\n\
5694\n\
5695Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005696of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697
5698static PyObject *
5699unicode_zfill(PyUnicodeObject *self, PyObject *args)
5700{
5701 int fill;
5702 PyUnicodeObject *u;
5703
5704 int width;
5705 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5706 return NULL;
5707
5708 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005709 if (PyUnicode_CheckExact(self)) {
5710 Py_INCREF(self);
5711 return (PyObject*) self;
5712 }
5713 else
5714 return PyUnicode_FromUnicode(
5715 PyUnicode_AS_UNICODE(self),
5716 PyUnicode_GET_SIZE(self)
5717 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 }
5719
5720 fill = width - self->length;
5721
5722 u = pad(self, fill, 0, '0');
5723
Walter Dörwald068325e2002-04-15 13:36:47 +00005724 if (u == NULL)
5725 return NULL;
5726
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 if (u->str[fill] == '+' || u->str[fill] == '-') {
5728 /* move sign to beginning of string */
5729 u->str[0] = u->str[fill];
5730 u->str[fill] = '0';
5731 }
5732
5733 return (PyObject*) u;
5734}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735
#if 0
/* Compiled out: returns the current value of unicode_freelist_size as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
5743
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005744PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005745"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005747Return True if S starts with the specified prefix, False otherwise.\n\
5748With optional start, test S beginning at that position.\n\
5749With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750
5751static PyObject *
5752unicode_startswith(PyUnicodeObject *self,
5753 PyObject *args)
5754{
5755 PyUnicodeObject *substring;
5756 int start = 0;
5757 int end = INT_MAX;
5758 PyObject *result;
5759
Guido van Rossumb8872e62000-05-09 14:14:27 +00005760 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5761 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 return NULL;
5763 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5764 (PyObject *)substring);
5765 if (substring == NULL)
5766 return NULL;
5767
Guido van Rossum77f6a652002-04-03 22:41:51 +00005768 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769
5770 Py_DECREF(substring);
5771 return result;
5772}
5773
5774
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005775PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005776"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005778Return True if S ends with the specified suffix, False otherwise.\n\
5779With optional start, test S beginning at that position.\n\
5780With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781
5782static PyObject *
5783unicode_endswith(PyUnicodeObject *self,
5784 PyObject *args)
5785{
5786 PyUnicodeObject *substring;
5787 int start = 0;
5788 int end = INT_MAX;
5789 PyObject *result;
5790
Guido van Rossumb8872e62000-05-09 14:14:27 +00005791 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5792 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793 return NULL;
5794 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5795 (PyObject *)substring);
5796 if (substring == NULL)
5797 return NULL;
5798
Guido van Rossum77f6a652002-04-03 22:41:51 +00005799 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
5801 Py_DECREF(substring);
5802 return result;
5803}
5804
5805
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005806
5807static PyObject *
5808unicode_getnewargs(PyUnicodeObject *v)
5809{
5810 return Py_BuildValue("(u#)", v->str, v->length);
5811}
5812
5813
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814static PyMethodDef unicode_methods[] = {
5815
5816 /* Order is according to common usage: often used methods should
5817 appear first, since lookup is done sequentially. */
5818
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005819 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5820 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5821 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5822 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5823 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5824 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5825 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5826 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5827 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5828 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5829 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5830 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5831 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005832 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005833/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5834 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5835 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5836 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005837 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005838 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005839 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005840 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5841 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5842 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5843 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5844 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5845 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5846 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5847 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5848 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5849 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5850 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5851 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5852 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5853 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005854 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005855#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005856 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857#endif
5858
5859#if 0
5860 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005861 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862#endif
5863
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005864 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 {NULL, NULL}
5866};
5867
Neil Schemenauerce30bc92002-11-18 16:10:18 +00005868static PyObject *
5869unicode_mod(PyObject *v, PyObject *w)
5870{
5871 if (!PyUnicode_Check(v)) {
5872 Py_INCREF(Py_NotImplemented);
5873 return Py_NotImplemented;
5874 }
5875 return PyUnicode_Format(v, w);
5876}
5877
5878static PyNumberMethods unicode_as_number = {
5879 0, /*nb_add*/
5880 0, /*nb_subtract*/
5881 0, /*nb_multiply*/
5882 0, /*nb_divide*/
5883 unicode_mod, /*nb_remainder*/
5884};
5885
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886static PySequenceMethods unicode_as_sequence = {
5887 (inquiry) unicode_length, /* sq_length */
5888 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5889 (intargfunc) unicode_repeat, /* sq_repeat */
5890 (intargfunc) unicode_getitem, /* sq_item */
5891 (intintargfunc) unicode_slice, /* sq_slice */
5892 0, /* sq_ass_item */
5893 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005894 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895};
5896
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005897static PyObject*
5898unicode_subscript(PyUnicodeObject* self, PyObject* item)
5899{
5900 if (PyInt_Check(item)) {
5901 long i = PyInt_AS_LONG(item);
5902 if (i < 0)
5903 i += PyString_GET_SIZE(self);
5904 return unicode_getitem(self, i);
5905 } else if (PyLong_Check(item)) {
5906 long i = PyLong_AsLong(item);
5907 if (i == -1 && PyErr_Occurred())
5908 return NULL;
5909 if (i < 0)
5910 i += PyString_GET_SIZE(self);
5911 return unicode_getitem(self, i);
5912 } else if (PySlice_Check(item)) {
5913 int start, stop, step, slicelength, cur, i;
5914 Py_UNICODE* source_buf;
5915 Py_UNICODE* result_buf;
5916 PyObject* result;
5917
5918 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5919 &start, &stop, &step, &slicelength) < 0) {
5920 return NULL;
5921 }
5922
5923 if (slicelength <= 0) {
5924 return PyUnicode_FromUnicode(NULL, 0);
5925 } else {
5926 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5927 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5928
5929 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5930 result_buf[i] = source_buf[cur];
5931 }
5932
5933 result = PyUnicode_FromUnicode(result_buf, slicelength);
5934 PyMem_FREE(result_buf);
5935 return result;
5936 }
5937 } else {
5938 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5939 return NULL;
5940 }
5941}
5942
5943static PyMappingMethods unicode_as_mapping = {
5944 (inquiry)unicode_length, /* mp_length */
5945 (binaryfunc)unicode_subscript, /* mp_subscript */
5946 (objobjargproc)0, /* mp_ass_subscript */
5947};
5948
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949static int
5950unicode_buffer_getreadbuf(PyUnicodeObject *self,
5951 int index,
5952 const void **ptr)
5953{
5954 if (index != 0) {
5955 PyErr_SetString(PyExc_SystemError,
5956 "accessing non-existent unicode segment");
5957 return -1;
5958 }
5959 *ptr = (void *) self->str;
5960 return PyUnicode_GET_DATA_SIZE(self);
5961}
5962
5963static int
5964unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5965 const void **ptr)
5966{
5967 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005968 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 return -1;
5970}
5971
5972static int
5973unicode_buffer_getsegcount(PyUnicodeObject *self,
5974 int *lenp)
5975{
5976 if (lenp)
5977 *lenp = PyUnicode_GET_DATA_SIZE(self);
5978 return 1;
5979}
5980
5981static int
5982unicode_buffer_getcharbuf(PyUnicodeObject *self,
5983 int index,
5984 const void **ptr)
5985{
5986 PyObject *str;
5987
5988 if (index != 0) {
5989 PyErr_SetString(PyExc_SystemError,
5990 "accessing non-existent unicode segment");
5991 return -1;
5992 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005993 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 if (str == NULL)
5995 return -1;
5996 *ptr = (void *) PyString_AS_STRING(str);
5997 return PyString_GET_SIZE(str);
5998}
5999
6000/* Helpers for PyUnicode_Format() */
6001
6002static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006003getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004{
6005 int argidx = *p_argidx;
6006 if (argidx < arglen) {
6007 (*p_argidx)++;
6008 if (arglen < 0)
6009 return args;
6010 else
6011 return PyTuple_GetItem(args, argidx);
6012 }
6013 PyErr_SetString(PyExc_TypeError,
6014 "not enough arguments for format string");
6015 return NULL;
6016}
6017
/* Formatting flag bits, one bit each so they can be OR-ed together. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6023
6024static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026{
6027 register int i;
6028 int len;
6029 va_list va;
6030 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
6033 /* First, format the string as char array, then expand to Py_UNICODE
6034 array. */
6035 charbuffer = (char *)buffer;
6036 len = vsprintf(charbuffer, format, va);
6037 for (i = len - 1; i >= 0; i--)
6038 buffer[i] = (Py_UNICODE) charbuffer[i];
6039
6040 va_end(va);
6041 return len;
6042}
6043
Guido van Rossum078151d2002-08-11 04:24:12 +00006044/* XXX To save some code duplication, formatfloat/long/int could have been
6045 shared with stringobject.c, converting from 8-bit to Unicode after the
6046 formatting is done. */
6047
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048static int
6049formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006050 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 int flags,
6052 int prec,
6053 int type,
6054 PyObject *v)
6055{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006056 /* fmt = '%#.' + `prec` + `type`
6057 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 char fmt[20];
6059 double x;
6060
6061 x = PyFloat_AsDouble(v);
6062 if (x == -1.0 && PyErr_Occurred())
6063 return -1;
6064 if (prec < 0)
6065 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6067 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006068 /* Worst case length calc to ensure no buffer overrun:
6069
6070 'g' formats:
6071 fmt = %#.<prec>g
6072 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6073 for any double rep.)
6074 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6075
6076 'f' formats:
6077 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6078 len = 1 + 50 + 1 + prec = 52 + prec
6079
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006080 If prec=0 the effective precision is 1 (the leading digit is
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006081 always given), therefore increase the length by one.
6082
6083 */
6084 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6085 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006086 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006087 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006088 return -1;
6089 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006090 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6091 (flags&F_ALT) ? "#" : "",
6092 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 return usprintf(buf, fmt, x);
6094}
6095
Tim Peters38fd5b62000-09-21 05:43:11 +00006096static PyObject*
6097formatlong(PyObject *val, int flags, int prec, int type)
6098{
6099 char *buf;
6100 int i, len;
6101 PyObject *str; /* temporary string object. */
6102 PyUnicodeObject *result;
6103
6104 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6105 if (!str)
6106 return NULL;
6107 result = _PyUnicode_New(len);
6108 for (i = 0; i < len; i++)
6109 result->str[i] = buf[i];
6110 result->str[len] = 0;
6111 Py_DECREF(str);
6112 return (PyObject*)result;
6113}
6114
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115static int
6116formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006117 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 int flags,
6119 int prec,
6120 int type,
6121 PyObject *v)
6122{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006123 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006124 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6125 * + 1 + 1
6126 * = 24
6127 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006128 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 long x;
6130
6131 x = PyInt_AsLong(v);
6132 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006133 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00006134 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00006135 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00006136 "%u/%o/%x/%X of negative int will return "
6137 "a signed string in Python 2.4 and up") < 0)
6138 return -1;
6139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006141 prec = 1;
6142
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006143 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006144 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6145 */
6146 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006147 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006148 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006149 return -1;
6150 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006151
6152 if ((flags & F_ALT) &&
6153 (type == 'x' || type == 'X')) {
6154 /* When converting under %#x or %#X, there are a number
6155 * of issues that cause pain:
6156 * - when 0 is being converted, the C standard leaves off
6157 * the '0x' or '0X', which is inconsistent with other
6158 * %#x/%#X conversions and inconsistent with Python's
6159 * hex() function
6160 * - there are platforms that violate the standard and
6161 * convert 0 with the '0x' or '0X'
6162 * (Metrowerks, Compaq Tru64)
6163 * - there are platforms that give '0x' when converting
6164 * under %#X, but convert 0 in accordance with the
6165 * standard (OS/2 EMX)
6166 *
6167 * We can achieve the desired consistency by inserting our
6168 * own '0x' or '0X' prefix, and substituting %x/%X in place
6169 * of %#x/%#X.
6170 *
6171 * Note that this is the same approach as used in
6172 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006173 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006174 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6175 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006176 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006177 else {
6178 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6179 (flags&F_ALT) ? "#" : "",
6180 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 return usprintf(buf, fmt, x);
6183}
6184
6185static int
6186formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006187 size_t buflen,
6188 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006190 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006191 if (PyUnicode_Check(v)) {
6192 if (PyUnicode_GET_SIZE(v) != 1)
6193 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006197 else if (PyString_Check(v)) {
6198 if (PyString_GET_SIZE(v) != 1)
6199 goto onError;
6200 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202
6203 else {
6204 /* Integer input truncated to a character */
6205 long x;
6206 x = PyInt_AsLong(v);
6207 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006208 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006209#ifdef Py_UNICODE_WIDE
6210 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006211 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006212 "%c arg not in range(0x110000) "
6213 "(wide Python build)");
6214 return -1;
6215 }
6216#else
6217 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006218 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006219 "%c arg not in range(0x10000) "
6220 "(narrow Python build)");
6221 return -1;
6222 }
6223#endif
6224 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 }
6226 buf[1] = '\0';
6227 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006228
6229 onError:
6230 PyErr_SetString(PyExc_TypeError,
6231 "%c requires int or char");
6232 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233}
6234
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006235/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6236
6237 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6238 chars are formatted. XXX This is a magic number. Each formatting
6239 routine does bounds checking to ensure no overflow, but a better
6240 solution may be to malloc a buffer of appropriate size for each
6241 format. For now, the current solution is sufficient.
6242*/
6243#define FORMATBUFLEN (size_t)120
6244
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245PyObject *PyUnicode_Format(PyObject *format,
6246 PyObject *args)
6247{
6248 Py_UNICODE *fmt, *res;
6249 int fmtcnt, rescnt, reslen, arglen, argidx;
6250 int args_owned = 0;
6251 PyUnicodeObject *result = NULL;
6252 PyObject *dict = NULL;
6253 PyObject *uformat;
6254
6255 if (format == NULL || args == NULL) {
6256 PyErr_BadInternalCall();
6257 return NULL;
6258 }
6259 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006260 if (uformat == NULL)
6261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 fmt = PyUnicode_AS_UNICODE(uformat);
6263 fmtcnt = PyUnicode_GET_SIZE(uformat);
6264
6265 reslen = rescnt = fmtcnt + 100;
6266 result = _PyUnicode_New(reslen);
6267 if (result == NULL)
6268 goto onError;
6269 res = PyUnicode_AS_UNICODE(result);
6270
6271 if (PyTuple_Check(args)) {
6272 arglen = PyTuple_Size(args);
6273 argidx = 0;
6274 }
6275 else {
6276 arglen = -1;
6277 argidx = -2;
6278 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006279 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6280 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 dict = args;
6282
6283 while (--fmtcnt >= 0) {
6284 if (*fmt != '%') {
6285 if (--rescnt < 0) {
6286 rescnt = fmtcnt + 100;
6287 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006288 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 return NULL;
6290 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6291 --rescnt;
6292 }
6293 *res++ = *fmt++;
6294 }
6295 else {
6296 /* Got a format specifier */
6297 int flags = 0;
6298 int width = -1;
6299 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 Py_UNICODE c = '\0';
6301 Py_UNICODE fill;
6302 PyObject *v = NULL;
6303 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006304 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 Py_UNICODE sign;
6306 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006307 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308
6309 fmt++;
6310 if (*fmt == '(') {
6311 Py_UNICODE *keystart;
6312 int keylen;
6313 PyObject *key;
6314 int pcount = 1;
6315
6316 if (dict == NULL) {
6317 PyErr_SetString(PyExc_TypeError,
6318 "format requires a mapping");
6319 goto onError;
6320 }
6321 ++fmt;
6322 --fmtcnt;
6323 keystart = fmt;
6324 /* Skip over balanced parentheses */
6325 while (pcount > 0 && --fmtcnt >= 0) {
6326 if (*fmt == ')')
6327 --pcount;
6328 else if (*fmt == '(')
6329 ++pcount;
6330 fmt++;
6331 }
6332 keylen = fmt - keystart - 1;
6333 if (fmtcnt < 0 || pcount > 0) {
6334 PyErr_SetString(PyExc_ValueError,
6335 "incomplete format key");
6336 goto onError;
6337 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006338#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006339 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340 then looked up since Python uses strings to hold
6341 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006342 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 key = PyUnicode_EncodeUTF8(keystart,
6344 keylen,
6345 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006346#else
6347 key = PyUnicode_FromUnicode(keystart, keylen);
6348#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 if (key == NULL)
6350 goto onError;
6351 if (args_owned) {
6352 Py_DECREF(args);
6353 args_owned = 0;
6354 }
6355 args = PyObject_GetItem(dict, key);
6356 Py_DECREF(key);
6357 if (args == NULL) {
6358 goto onError;
6359 }
6360 args_owned = 1;
6361 arglen = -1;
6362 argidx = -2;
6363 }
6364 while (--fmtcnt >= 0) {
6365 switch (c = *fmt++) {
6366 case '-': flags |= F_LJUST; continue;
6367 case '+': flags |= F_SIGN; continue;
6368 case ' ': flags |= F_BLANK; continue;
6369 case '#': flags |= F_ALT; continue;
6370 case '0': flags |= F_ZERO; continue;
6371 }
6372 break;
6373 }
6374 if (c == '*') {
6375 v = getnextarg(args, arglen, &argidx);
6376 if (v == NULL)
6377 goto onError;
6378 if (!PyInt_Check(v)) {
6379 PyErr_SetString(PyExc_TypeError,
6380 "* wants int");
6381 goto onError;
6382 }
6383 width = PyInt_AsLong(v);
6384 if (width < 0) {
6385 flags |= F_LJUST;
6386 width = -width;
6387 }
6388 if (--fmtcnt >= 0)
6389 c = *fmt++;
6390 }
6391 else if (c >= '0' && c <= '9') {
6392 width = c - '0';
6393 while (--fmtcnt >= 0) {
6394 c = *fmt++;
6395 if (c < '0' || c > '9')
6396 break;
6397 if ((width*10) / 10 != width) {
6398 PyErr_SetString(PyExc_ValueError,
6399 "width too big");
6400 goto onError;
6401 }
6402 width = width*10 + (c - '0');
6403 }
6404 }
6405 if (c == '.') {
6406 prec = 0;
6407 if (--fmtcnt >= 0)
6408 c = *fmt++;
6409 if (c == '*') {
6410 v = getnextarg(args, arglen, &argidx);
6411 if (v == NULL)
6412 goto onError;
6413 if (!PyInt_Check(v)) {
6414 PyErr_SetString(PyExc_TypeError,
6415 "* wants int");
6416 goto onError;
6417 }
6418 prec = PyInt_AsLong(v);
6419 if (prec < 0)
6420 prec = 0;
6421 if (--fmtcnt >= 0)
6422 c = *fmt++;
6423 }
6424 else if (c >= '0' && c <= '9') {
6425 prec = c - '0';
6426 while (--fmtcnt >= 0) {
6427 c = Py_CHARMASK(*fmt++);
6428 if (c < '0' || c > '9')
6429 break;
6430 if ((prec*10) / 10 != prec) {
6431 PyErr_SetString(PyExc_ValueError,
6432 "prec too big");
6433 goto onError;
6434 }
6435 prec = prec*10 + (c - '0');
6436 }
6437 }
6438 } /* prec */
6439 if (fmtcnt >= 0) {
6440 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 if (--fmtcnt >= 0)
6442 c = *fmt++;
6443 }
6444 }
6445 if (fmtcnt < 0) {
6446 PyErr_SetString(PyExc_ValueError,
6447 "incomplete format");
6448 goto onError;
6449 }
6450 if (c != '%') {
6451 v = getnextarg(args, arglen, &argidx);
6452 if (v == NULL)
6453 goto onError;
6454 }
6455 sign = 0;
6456 fill = ' ';
6457 switch (c) {
6458
6459 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006460 pbuf = formatbuf;
6461 /* presume that buffer length is at least 1 */
6462 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 len = 1;
6464 break;
6465
6466 case 's':
6467 case 'r':
6468 if (PyUnicode_Check(v) && c == 's') {
6469 temp = v;
6470 Py_INCREF(temp);
6471 }
6472 else {
6473 PyObject *unicode;
6474 if (c == 's')
6475 temp = PyObject_Str(v);
6476 else
6477 temp = PyObject_Repr(v);
6478 if (temp == NULL)
6479 goto onError;
6480 if (!PyString_Check(temp)) {
6481 /* XXX Note: this should never happen, since
6482 PyObject_Repr() and PyObject_Str() assure
6483 this */
6484 Py_DECREF(temp);
6485 PyErr_SetString(PyExc_TypeError,
6486 "%s argument has non-string str()");
6487 goto onError;
6488 }
Fred Drakee4315f52000-05-09 19:53:39 +00006489 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006491 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 "strict");
6493 Py_DECREF(temp);
6494 temp = unicode;
6495 if (temp == NULL)
6496 goto onError;
6497 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006498 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 len = PyUnicode_GET_SIZE(temp);
6500 if (prec >= 0 && len > prec)
6501 len = prec;
6502 break;
6503
6504 case 'i':
6505 case 'd':
6506 case 'u':
6507 case 'o':
6508 case 'x':
6509 case 'X':
6510 if (c == 'i')
6511 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006512 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006513 temp = formatlong(v, flags, prec, c);
6514 if (!temp)
6515 goto onError;
6516 pbuf = PyUnicode_AS_UNICODE(temp);
6517 len = PyUnicode_GET_SIZE(temp);
6518 /* unbounded ints can always produce
6519 a sign character! */
6520 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006522 else {
6523 pbuf = formatbuf;
6524 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6525 flags, prec, c, v);
6526 if (len < 0)
6527 goto onError;
6528 /* only d conversion is signed */
6529 sign = c == 'd';
6530 }
6531 if (flags & F_ZERO)
6532 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 break;
6534
6535 case 'e':
6536 case 'E':
6537 case 'f':
6538 case 'g':
6539 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006540 pbuf = formatbuf;
6541 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6542 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 if (len < 0)
6544 goto onError;
6545 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006546 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 fill = '0';
6548 break;
6549
6550 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006551 pbuf = formatbuf;
6552 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 if (len < 0)
6554 goto onError;
6555 break;
6556
6557 default:
6558 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006559 "unsupported format character '%c' (0x%x) "
6560 "at index %i",
Neal Norwitza0378e12002-09-13 13:47:06 +00006561 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006562 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006563 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 goto onError;
6565 }
6566 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006567 if (*pbuf == '-' || *pbuf == '+') {
6568 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 len--;
6570 }
6571 else if (flags & F_SIGN)
6572 sign = '+';
6573 else if (flags & F_BLANK)
6574 sign = ' ';
6575 else
6576 sign = 0;
6577 }
6578 if (width < len)
6579 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006580 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 reslen -= rescnt;
6582 rescnt = width + fmtcnt + 100;
6583 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006584 if (reslen < 0) {
6585 Py_DECREF(result);
6586 return PyErr_NoMemory();
6587 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006588 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 return NULL;
6590 res = PyUnicode_AS_UNICODE(result)
6591 + reslen - rescnt;
6592 }
6593 if (sign) {
6594 if (fill != ' ')
6595 *res++ = sign;
6596 rescnt--;
6597 if (width > len)
6598 width--;
6599 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006600 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6601 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006602 assert(pbuf[1] == c);
6603 if (fill != ' ') {
6604 *res++ = *pbuf++;
6605 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006606 }
Tim Petersfff53252001-04-12 18:38:48 +00006607 rescnt -= 2;
6608 width -= 2;
6609 if (width < 0)
6610 width = 0;
6611 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 if (width > len && !(flags & F_LJUST)) {
6614 do {
6615 --rescnt;
6616 *res++ = fill;
6617 } while (--width > len);
6618 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006619 if (fill == ' ') {
6620 if (sign)
6621 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006622 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006623 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006624 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006625 *res++ = *pbuf++;
6626 *res++ = *pbuf++;
6627 }
6628 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006629 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630 res += len;
6631 rescnt -= len;
6632 while (--width >= len) {
6633 --rescnt;
6634 *res++ = ' ';
6635 }
6636 if (dict && (argidx < arglen) && c != '%') {
6637 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006638 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 goto onError;
6640 }
6641 Py_XDECREF(temp);
6642 } /* '%' */
6643 } /* until end */
6644 if (argidx < arglen && !dict) {
6645 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006646 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 goto onError;
6648 }
6649
6650 if (args_owned) {
6651 Py_DECREF(args);
6652 }
6653 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006654 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006655 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 return (PyObject *)result;
6657
6658 onError:
6659 Py_XDECREF(result);
6660 Py_DECREF(uformat);
6661 if (args_owned) {
6662 Py_DECREF(args);
6663 }
6664 return NULL;
6665}
6666
6667static PyBufferProcs unicode_as_buffer = {
6668 (getreadbufferproc) unicode_buffer_getreadbuf,
6669 (getwritebufferproc) unicode_buffer_getwritebuf,
6670 (getsegcountproc) unicode_buffer_getsegcount,
6671 (getcharbufferproc) unicode_buffer_getcharbuf,
6672};
6673
Jeremy Hylton938ace62002-07-17 16:30:39 +00006674static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006675unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6676
Tim Peters6d6c1a32001-08-02 04:15:00 +00006677static PyObject *
6678unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6679{
6680 PyObject *x = NULL;
6681 static char *kwlist[] = {"string", "encoding", "errors", 0};
6682 char *encoding = NULL;
6683 char *errors = NULL;
6684
Guido van Rossume023fe02001-08-30 03:12:59 +00006685 if (type != &PyUnicode_Type)
6686 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006687 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6688 kwlist, &x, &encoding, &errors))
6689 return NULL;
6690 if (x == NULL)
6691 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006692 if (encoding == NULL && errors == NULL)
6693 return PyObject_Unicode(x);
6694 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006695 return PyUnicode_FromEncodedObject(x, encoding, errors);
6696}
6697
Guido van Rossume023fe02001-08-30 03:12:59 +00006698static PyObject *
6699unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6700{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006701 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006702 int n;
6703
6704 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6705 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6706 if (tmp == NULL)
6707 return NULL;
6708 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006709 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006710 if (pnew == NULL) {
6711 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00006712 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00006713 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006714 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6715 if (pnew->str == NULL) {
6716 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006717 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006718 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00006719 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00006720 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006721 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6722 pnew->length = n;
6723 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006724 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006725 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006726}
6727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006728PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00006729"unicode(string [, encoding[, errors]]) -> object\n\
6730\n\
6731Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00006732encoding defaults to the current default string encoding.\n\
6733errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735PyTypeObject PyUnicode_Type = {
6736 PyObject_HEAD_INIT(&PyType_Type)
6737 0, /* ob_size */
6738 "unicode", /* tp_name */
6739 sizeof(PyUnicodeObject), /* tp_size */
6740 0, /* tp_itemsize */
6741 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00006742 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006744 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 0, /* tp_setattr */
6746 (cmpfunc) unicode_compare, /* tp_compare */
6747 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006748 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006750 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 (hashfunc) unicode_hash, /* tp_hash*/
6752 0, /* tp_call*/
6753 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006754 PyObject_GenericGetAttr, /* tp_getattro */
6755 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006757 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
6758 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006759 unicode_doc, /* tp_doc */
6760 0, /* tp_traverse */
6761 0, /* tp_clear */
6762 0, /* tp_richcompare */
6763 0, /* tp_weaklistoffset */
6764 0, /* tp_iter */
6765 0, /* tp_iternext */
6766 unicode_methods, /* tp_methods */
6767 0, /* tp_members */
6768 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00006769 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006770 0, /* tp_dict */
6771 0, /* tp_descr_get */
6772 0, /* tp_descr_set */
6773 0, /* tp_dictoffset */
6774 0, /* tp_init */
6775 0, /* tp_alloc */
6776 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006777 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778};
6779
6780/* Initialize the Unicode implementation */
6781
Thomas Wouters78890102000-07-22 19:25:51 +00006782void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006784 int i;
6785
Fred Drakee4315f52000-05-09 19:53:39 +00006786 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006787 unicode_freelist = NULL;
6788 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00006790 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006791 for (i = 0; i < 256; i++)
6792 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006793 if (PyType_Ready(&PyUnicode_Type) < 0)
6794 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795}
6796
6797/* Finalize the Unicode implementation */
6798
6799void
Thomas Wouters78890102000-07-22 19:25:51 +00006800_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006802 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006803 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006805 Py_XDECREF(unicode_empty);
6806 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006807
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006808 for (i = 0; i < 256; i++) {
6809 if (unicode_latin1[i]) {
6810 Py_DECREF(unicode_latin1[i]);
6811 unicode_latin1[i] = NULL;
6812 }
6813 }
6814
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006815 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 PyUnicodeObject *v = u;
6817 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006818 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006819 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006820 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006821 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006823 unicode_freelist = NULL;
6824 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006826
6827/*
6828Local variables:
6829c-basic-offset: 4
6830indent-tabs-mode: nil
6831End:
6832*/