blob: b167a1d723a88eca732cbbcedadbafea064981be [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000279 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000280 *unicode = (PyObject *)w;
281 return 0;
282 }
283
284 /* Note that we don't have to modify *unicode for unshared Unicode
285 objects, since we can modify them in-place. */
286 return unicode_resize(v, length);
287}
288
/* Internal API for use in unicodeobject.c only !
   Wrapper around PyUnicode_Resize() that casts the address of a
   PyUnicodeObject * variable to the PyObject ** the function expects. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
292
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
294 int size)
295{
296 PyUnicodeObject *unicode;
297
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000298 /* If the Unicode data is known at construction time, we can apply
299 some optimizations which share commonly used objects. */
300 if (u != NULL) {
301
302 /* Optimization for empty strings */
303 if (size == 0 && unicode_empty != NULL) {
304 Py_INCREF(unicode_empty);
305 return (PyObject *)unicode_empty;
306 }
307
308 /* Single character Unicode objects in the Latin-1 range are
309 shared when using this constructor */
310 if (size == 1 && *u < 256) {
311 unicode = unicode_latin1[*u];
312 if (!unicode) {
313 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000314 if (!unicode)
315 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000316 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000317 unicode_latin1[*u] = unicode;
318 }
319 Py_INCREF(unicode);
320 return (PyObject *)unicode;
321 }
322 }
323
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 unicode = _PyUnicode_New(size);
325 if (!unicode)
326 return NULL;
327
328 /* Copy the Unicode data into the new object */
329 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331
332 return (PyObject *)unicode;
333}
334
335#ifdef HAVE_WCHAR_H
336
337PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
338 int size)
339{
340 PyUnicodeObject *unicode;
341
342 if (w == NULL) {
343 PyErr_BadInternalCall();
344 return NULL;
345 }
346
347 unicode = _PyUnicode_New(size);
348 if (!unicode)
349 return NULL;
350
351 /* Copy the wchar_t data into the new object */
352#ifdef HAVE_USABLE_WCHAR_T
353 memcpy(unicode->str, w, size * sizeof(wchar_t));
354#else
355 {
356 register Py_UNICODE *u;
357 register int i;
358 u = PyUnicode_AS_UNICODE(unicode);
359 for (i = size; i >= 0; i--)
360 *u++ = *w++;
361 }
362#endif
363
364 return (PyObject *)unicode;
365}
366
367int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
368 register wchar_t *w,
369 int size)
370{
371 if (unicode == NULL) {
372 PyErr_BadInternalCall();
373 return -1;
374 }
375 if (size > PyUnicode_GET_SIZE(unicode))
376 size = PyUnicode_GET_SIZE(unicode);
377#ifdef HAVE_USABLE_WCHAR_T
378 memcpy(w, unicode->str, size * sizeof(wchar_t));
379#else
380 {
381 register Py_UNICODE *u;
382 register int i;
383 u = PyUnicode_AS_UNICODE(unicode);
384 for (i = size; i >= 0; i--)
385 *w++ = *u++;
386 }
387#endif
388
389 return size;
390}
391
392#endif
393
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000394PyObject *PyUnicode_FromOrdinal(int ordinal)
395{
396 Py_UNICODE s[2];
397
398#ifdef Py_UNICODE_WIDE
399 if (ordinal < 0 || ordinal > 0x10ffff) {
400 PyErr_SetString(PyExc_ValueError,
401 "unichr() arg not in range(0x110000) "
402 "(wide Python build)");
403 return NULL;
404 }
405#else
406 if (ordinal < 0 || ordinal > 0xffff) {
407 PyErr_SetString(PyExc_ValueError,
408 "unichr() arg not in range(0x10000) "
409 "(narrow Python build)");
410 return NULL;
411 }
412#endif
413
414 if (ordinal <= 0xffff) {
415 /* UCS-2 character */
416 s[0] = (Py_UNICODE) ordinal;
417 return PyUnicode_FromUnicode(s, 1);
418 }
419 else {
420#ifndef Py_UNICODE_WIDE
421 /* UCS-4 character. store as two surrogate characters */
422 ordinal -= 0x10000L;
423 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
424 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
425 return PyUnicode_FromUnicode(s, 2);
426#else
427 s[0] = (Py_UNICODE)ordinal;
428 return PyUnicode_FromUnicode(s, 1);
429#endif
430 }
431}
432
Guido van Rossumd57fd912000-03-10 22:53:23 +0000433PyObject *PyUnicode_FromObject(register PyObject *obj)
434{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000435 /* XXX Perhaps we should make this API an alias of
436 PyObject_Unicode() instead ?! */
437 if (PyUnicode_CheckExact(obj)) {
438 Py_INCREF(obj);
439 return obj;
440 }
441 if (PyUnicode_Check(obj)) {
442 /* For a Unicode subtype that's not a Unicode object,
443 return a true Unicode object with the same data. */
444 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
445 PyUnicode_GET_SIZE(obj));
446 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
448}
449
450PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
451 const char *encoding,
452 const char *errors)
453{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000456 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457
458 if (obj == NULL) {
459 PyErr_BadInternalCall();
460 return NULL;
461 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000462
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000463#if 0
464 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000465 that no encodings is given and then redirect to
466 PyObject_Unicode() which then applies the additional logic for
467 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000468
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000469 NOTE: This API should really only be used for object which
470 represent *encoded* Unicode !
471
472 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000473 if (PyUnicode_Check(obj)) {
474 if (encoding) {
475 PyErr_SetString(PyExc_TypeError,
476 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000477 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000478 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000479 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481#else
482 if (PyUnicode_Check(obj)) {
483 PyErr_SetString(PyExc_TypeError,
484 "decoding Unicode is not supported");
485 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000486 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000487#endif
488
489 /* Coerce object */
490 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000491 s = PyString_AS_STRING(obj);
492 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000493 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000494 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
495 /* Overwrite the error message with something more useful in
496 case of a TypeError. */
497 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499 "coercing to Unicode: need string or buffer, "
500 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 obj->ob_type->tp_name);
502 goto onError;
503 }
504
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 if (len == 0) {
507 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 else
511 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000512
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 return v;
514
515 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517}
518
519PyObject *PyUnicode_Decode(const char *s,
520 int size,
521 const char *encoding,
522 const char *errors)
523{
524 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000525
526 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000527 encoding = PyUnicode_GetDefaultEncoding();
528
529 /* Shortcuts for common default encodings */
530 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000532 else if (strcmp(encoding, "latin-1") == 0)
533 return PyUnicode_DecodeLatin1(s, size, errors);
534 else if (strcmp(encoding, "ascii") == 0)
535 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000536
537 /* Decode via the codec registry */
538 buffer = PyBuffer_FromMemory((void *)s, size);
539 if (buffer == NULL)
540 goto onError;
541 unicode = PyCodec_Decode(buffer, encoding, errors);
542 if (unicode == NULL)
543 goto onError;
544 if (!PyUnicode_Check(unicode)) {
545 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000546 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000547 unicode->ob_type->tp_name);
548 Py_DECREF(unicode);
549 goto onError;
550 }
551 Py_DECREF(buffer);
552 return unicode;
553
554 onError:
555 Py_XDECREF(buffer);
556 return NULL;
557}
558
559PyObject *PyUnicode_Encode(const Py_UNICODE *s,
560 int size,
561 const char *encoding,
562 const char *errors)
563{
564 PyObject *v, *unicode;
565
566 unicode = PyUnicode_FromUnicode(s, size);
567 if (unicode == NULL)
568 return NULL;
569 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
570 Py_DECREF(unicode);
571 return v;
572}
573
574PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
575 const char *encoding,
576 const char *errors)
577{
578 PyObject *v;
579
580 if (!PyUnicode_Check(unicode)) {
581 PyErr_BadArgument();
582 goto onError;
583 }
Fred Drakee4315f52000-05-09 19:53:39 +0000584
585 if (encoding == NULL)
586 encoding = PyUnicode_GetDefaultEncoding();
587
588 /* Shortcuts for common default encodings */
589 if (errors == NULL) {
590 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000591 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000592 else if (strcmp(encoding, "latin-1") == 0)
593 return PyUnicode_AsLatin1String(unicode);
594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_AsASCIIString(unicode);
596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597
598 /* Encode via the codec registry */
599 v = PyCodec_Encode(unicode, encoding, errors);
600 if (v == NULL)
601 goto onError;
602 /* XXX Should we really enforce this ? */
603 if (!PyString_Check(v)) {
604 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000605 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606 v->ob_type->tp_name);
607 Py_DECREF(v);
608 goto onError;
609 }
610 return v;
611
612 onError:
613 return NULL;
614}
615
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000616PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
617 const char *errors)
618{
619 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
620
621 if (v)
622 return v;
623 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
624 if (v && errors == NULL)
625 ((PyUnicodeObject *)unicode)->defenc = v;
626 return v;
627}
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
630{
631 if (!PyUnicode_Check(unicode)) {
632 PyErr_BadArgument();
633 goto onError;
634 }
635 return PyUnicode_AS_UNICODE(unicode);
636
637 onError:
638 return NULL;
639}
640
641int PyUnicode_GetSize(PyObject *unicode)
642{
643 if (!PyUnicode_Check(unicode)) {
644 PyErr_BadArgument();
645 goto onError;
646 }
647 return PyUnicode_GET_SIZE(unicode);
648
649 onError:
650 return -1;
651}
652
Thomas Wouters78890102000-07-22 19:25:51 +0000653const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000654{
655 return unicode_default_encoding;
656}
657
658int PyUnicode_SetDefaultEncoding(const char *encoding)
659{
660 PyObject *v;
661
662 /* Make sure the encoding is valid. As side effect, this also
663 loads the encoding into the codec registry cache. */
664 v = _PyCodec_Lookup(encoding);
665 if (v == NULL)
666 goto onError;
667 Py_DECREF(v);
668 strncpy(unicode_default_encoding,
669 encoding,
670 sizeof(unicode_default_encoding));
671 return 0;
672
673 onError:
674 return -1;
675}
676
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000677/* error handling callback helper:
678 build arguments, call the callback and check the arguments,
679 if no exception occured, copy the replacement to the output
680 and adjust various state variables.
681 return 0 on success, -1 on error
682*/
683
684static
685int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
686 const char *encoding, const char *reason,
687 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
688 PyObject **output, int *outpos, Py_UNICODE **outptr)
689{
690 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
691
692 PyObject *restuple = NULL;
693 PyObject *repunicode = NULL;
694 int outsize = PyUnicode_GET_SIZE(*output);
695 int requiredsize;
696 int newpos;
697 Py_UNICODE *repptr;
698 int repsize;
699 int res = -1;
700
701 if (*errorHandler == NULL) {
702 *errorHandler = PyCodec_LookupError(errors);
703 if (*errorHandler == NULL)
704 goto onError;
705 }
706
707 if (*exceptionObject == NULL) {
708 *exceptionObject = PyUnicodeDecodeError_Create(
709 encoding, input, insize, *startinpos, *endinpos, reason);
710 if (*exceptionObject == NULL)
711 goto onError;
712 }
713 else {
714 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
715 goto onError;
716 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
717 goto onError;
718 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
719 goto onError;
720 }
721
722 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
723 if (restuple == NULL)
724 goto onError;
725 if (!PyTuple_Check(restuple)) {
726 PyErr_Format(PyExc_TypeError, &argparse[4]);
727 goto onError;
728 }
729 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
730 goto onError;
731 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000732 newpos = insize+newpos;
733 if (newpos<0 || newpos>insize) {
734 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
735 goto onError;
736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000737
738 /* need more space? (at least enough for what we
739 have+the replacement+the rest of the string (starting
740 at the new input position), so we won't have to check space
741 when there are no errors in the rest of the string) */
742 repptr = PyUnicode_AS_UNICODE(repunicode);
743 repsize = PyUnicode_GET_SIZE(repunicode);
744 requiredsize = *outpos + repsize + insize-newpos;
745 if (requiredsize > outsize) {
746 if (requiredsize<2*outsize)
747 requiredsize = 2*outsize;
748 if (PyUnicode_Resize(output, requiredsize))
749 goto onError;
750 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
751 }
752 *endinpos = newpos;
753 *inptr = input + newpos;
754 Py_UNICODE_COPY(*outptr, repptr, repsize);
755 *outptr += repsize;
756 *outpos += repsize;
757 /* we made it! */
758 res = 0;
759
760 onError:
761 Py_XDECREF(restuple);
762 return res;
763}
764
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000765/* --- UTF-7 Codec -------------------------------------------------------- */
766
767/* see RFC2152 for details */
768
/* Classification of the 128 ASCII characters for the UTF-7 codec.
   Each entry indicates whether the character is special, i.e. cannot
   be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
787
/* True if character c cannot be written directly in UTF-7: anything
   above 127, anything classified 1 in utf7_special[], whitespace
   (class 2) when encodeWS is set, and Set O (class 3) when encodeO is
   set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  isalnum() is only defined for values
   representable as unsigned char (or EOF), and c may be an arbitrary
   Py_UNICODE here, so restrict the call to the ASCII range first; no
   character outside that range is a base64 digit anyway. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* Value (0..63) of the base64 digit c; c must satisfy B64CHAR(). */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 digits to out for as long as at least six pending bits
   remain in ch; bits is reduced by six for every digit written. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Extract 16-bit units from ch for as long as at least sixteen
   pending bits remain and append them to out.  A unit in the range
   0xDC00..0xDFFF sets errmsg and jumps to the utf7Error label of the
   enclosing function; the unit following such an error is dropped. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }

822
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000823PyObject *PyUnicode_DecodeUTF7(const char *s,
824 int size,
825 const char *errors)
826{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000827 const char *starts = s;
828 int startinpos;
829 int endinpos;
830 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000831 const char *e;
832 PyUnicodeObject *unicode;
833 Py_UNICODE *p;
834 const char *errmsg = "";
835 int inShift = 0;
836 unsigned int bitsleft = 0;
837 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000838 int surrogate = 0;
839 PyObject *errorHandler = NULL;
840 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000841
842 unicode = _PyUnicode_New(size);
843 if (!unicode)
844 return NULL;
845 if (size == 0)
846 return (PyObject *)unicode;
847
848 p = unicode->str;
849 e = s + size;
850
851 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852 Py_UNICODE ch;
853 restart:
854 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000855
856 if (inShift) {
857 if ((ch == '-') || !B64CHAR(ch)) {
858 inShift = 0;
859 s++;
860
861 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
862 if (bitsleft >= 6) {
863 /* The shift sequence has a partial character in it. If
864 bitsleft < 6 then we could just classify it as padding
865 but that is not the case here */
866
867 errmsg = "partial character in shift sequence";
868 goto utf7Error;
869 }
870 /* According to RFC2152 the remaining bits should be zero. We
871 choose to signal an error/insert a replacement character
872 here so indicate the potential of a misencoded character. */
873
874 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
875 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
876 errmsg = "non-zero padding bits in shift sequence";
877 goto utf7Error;
878 }
879
880 if (ch == '-') {
881 if ((s < e) && (*(s) == '-')) {
882 *p++ = '-';
883 inShift = 1;
884 }
885 } else if (SPECIAL(ch,0,0)) {
886 errmsg = "unexpected special character";
887 goto utf7Error;
888 } else {
889 *p++ = ch;
890 }
891 } else {
892 charsleft = (charsleft << 6) | UB64(ch);
893 bitsleft += 6;
894 s++;
895 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
896 }
897 }
898 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000899 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000900 s++;
901 if (s < e && *s == '-') {
902 s++;
903 *p++ = '+';
904 } else
905 {
906 inShift = 1;
907 bitsleft = 0;
908 }
909 }
910 else if (SPECIAL(ch,0,0)) {
911 errmsg = "unexpected special character";
912 s++;
913 goto utf7Error;
914 }
915 else {
916 *p++ = ch;
917 s++;
918 }
919 continue;
920 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000921 outpos = p-PyUnicode_AS_UNICODE(unicode);
922 endinpos = s-starts;
923 if (unicode_decode_call_errorhandler(
924 errors, &errorHandler,
925 "utf7", errmsg,
926 starts, size, &startinpos, &endinpos, &exc, &s,
927 (PyObject **)&unicode, &outpos, &p))
928 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000929 }
930
931 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000932 outpos = p-PyUnicode_AS_UNICODE(unicode);
933 endinpos = size;
934 if (unicode_decode_call_errorhandler(
935 errors, &errorHandler,
936 "utf7", "unterminated shift sequence",
937 starts, size, &startinpos, &endinpos, &exc, &s,
938 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000939 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000940 if (s < e)
941 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942 }
943
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000944 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000945 goto onError;
946
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000947 Py_XDECREF(errorHandler);
948 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 return (PyObject *)unicode;
950
951onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000952 Py_XDECREF(errorHandler);
953 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000954 Py_DECREF(unicode);
955 return NULL;
956}
957
958
959PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
960 int size,
961 int encodeSetO,
962 int encodeWhiteSpace,
963 const char *errors)
964{
965 PyObject *v;
966 /* It might be possible to tighten this worst case */
967 unsigned int cbAllocated = 5 * size;
968 int inShift = 0;
969 int i = 0;
970 unsigned int bitsleft = 0;
971 unsigned long charsleft = 0;
972 char * out;
973 char * start;
974
975 if (size == 0)
976 return PyString_FromStringAndSize(NULL, 0);
977
978 v = PyString_FromStringAndSize(NULL, cbAllocated);
979 if (v == NULL)
980 return NULL;
981
982 start = out = PyString_AS_STRING(v);
983 for (;i < size; ++i) {
984 Py_UNICODE ch = s[i];
985
986 if (!inShift) {
987 if (ch == '+') {
988 *out++ = '+';
989 *out++ = '-';
990 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
991 charsleft = ch;
992 bitsleft = 16;
993 *out++ = '+';
994 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
995 inShift = bitsleft > 0;
996 } else {
997 *out++ = (char) ch;
998 }
999 } else {
1000 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1001 *out++ = B64(charsleft << (6-bitsleft));
1002 charsleft = 0;
1003 bitsleft = 0;
1004 /* Characters not in the BASE64 set implicitly unshift the sequence
1005 so no '-' is required, except if the character is itself a '-' */
1006 if (B64CHAR(ch) || ch == '-') {
1007 *out++ = '-';
1008 }
1009 inShift = 0;
1010 *out++ = (char) ch;
1011 } else {
1012 bitsleft += 16;
1013 charsleft = (charsleft << 16) | ch;
1014 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1015
1016 /* If the next character is special then we dont' need to terminate
1017 the shift sequence. If the next character is not a BASE64 character
1018 or '-' then the shift sequence will be terminated implicitly and we
1019 don't have to insert a '-'. */
1020
1021 if (bitsleft == 0) {
1022 if (i + 1 < size) {
1023 Py_UNICODE ch2 = s[i+1];
1024
1025 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1026
1027 } else if (B64CHAR(ch2) || ch2 == '-') {
1028 *out++ = '-';
1029 inShift = 0;
1030 } else {
1031 inShift = 0;
1032 }
1033
1034 }
1035 else {
1036 *out++ = '-';
1037 inShift = 0;
1038 }
1039 }
1040 }
1041 }
1042 }
1043 if (bitsleft) {
1044 *out++= B64(charsleft << (6-bitsleft) );
1045 *out++ = '-';
1046 }
1047
Tim Peters5de98422002-04-27 18:44:32 +00001048 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001049 return v;
1050}
1051
1052#undef SPECIAL
1053#undef B64
1054#undef B64CHAR
1055#undef UB64
1056#undef ENCODE
1057#undef DECODE
1058
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059/* --- UTF-8 Codec -------------------------------------------------------- */
1060
/* Sequence length announced by each possible UTF-8 lead byte (see RFC 2279
   for details).  A zero entry marks a byte that cannot start a sequence. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
    2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
    /* 0xF0 - 0xF7: four; 0xF8 - 0xFB: five; 0xFC - 0xFD: six;
       0xFE - 0xFF: illegal */
    4,4,4,4, 4,4,4,4, 5,5,5,5, 6,6,0,0
};
1082
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083PyObject *PyUnicode_DecodeUTF8(const char *s,
1084 int size,
1085 const char *errors)
1086{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001087 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001089 int startinpos;
1090 int endinpos;
1091 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 const char *e;
1093 PyUnicodeObject *unicode;
1094 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001095 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001096 PyObject *errorHandler = NULL;
1097 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098
1099 /* Note: size will always be longer than the resulting Unicode
1100 character count */
1101 unicode = _PyUnicode_New(size);
1102 if (!unicode)
1103 return NULL;
1104 if (size == 0)
1105 return (PyObject *)unicode;
1106
1107 /* Unpack UTF-8 encoded data */
1108 p = unicode->str;
1109 e = s + size;
1110
1111 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001112 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113
1114 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001115 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 s++;
1117 continue;
1118 }
1119
1120 n = utf8_code_length[ch];
1121
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001122 if (s + n > e) {
1123 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001124 startinpos = s-starts;
1125 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001126 goto utf8Error;
1127 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128
1129 switch (n) {
1130
1131 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001132 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001133 startinpos = s-starts;
1134 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001135 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136
1137 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001138 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001139 startinpos = s-starts;
1140 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001141 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142
1143 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001144 if ((s[1] & 0xc0) != 0x80) {
1145 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001146 startinpos = s-starts;
1147 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001148 goto utf8Error;
1149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001151 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001152 startinpos = s-starts;
1153 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001154 errmsg = "illegal encoding";
1155 goto utf8Error;
1156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001157 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001158 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 break;
1160
1161 case 3:
1162 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001163 (s[2] & 0xc0) != 0x80) {
1164 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001165 startinpos = s-starts;
1166 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001167 goto utf8Error;
1168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001170 if (ch < 0x0800) {
1171 /* Note: UTF-8 encodings of surrogates are considered
1172 legal UTF-8 sequences;
1173
1174 XXX For wide builds (UCS-4) we should probably try
1175 to recombine the surrogates into a single code
1176 unit.
1177 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001179 startinpos = s-starts;
1180 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001181 goto utf8Error;
1182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001184 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001185 break;
1186
1187 case 4:
1188 if ((s[1] & 0xc0) != 0x80 ||
1189 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001190 (s[3] & 0xc0) != 0x80) {
1191 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001192 startinpos = s-starts;
1193 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001194 goto utf8Error;
1195 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001196 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1197 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1198 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001199 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001200 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001201 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001202 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001203 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001204 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001205 startinpos = s-starts;
1206 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001207 goto utf8Error;
1208 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001209#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001210 *p++ = (Py_UNICODE)ch;
1211#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001212 /* compute and append the two surrogates: */
1213
1214 /* translate from 10000..10FFFF to 0..FFFF */
1215 ch -= 0x10000;
1216
1217 /* high surrogate = top 10 bits added to D800 */
1218 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1219
1220 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001221 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001222#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 break;
1224
1225 default:
1226 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001228 startinpos = s-starts;
1229 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001230 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 }
1232 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001233 continue;
1234
1235 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001236 outpos = p-PyUnicode_AS_UNICODE(unicode);
1237 if (unicode_decode_call_errorhandler(
1238 errors, &errorHandler,
1239 "utf8", errmsg,
1240 starts, size, &startinpos, &endinpos, &exc, &s,
1241 (PyObject **)&unicode, &outpos, &p))
1242 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 }
1244
1245 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001246 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 goto onError;
1248
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001249 Py_XDECREF(errorHandler);
1250 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 return (PyObject *)unicode;
1252
1253onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001254 Py_XDECREF(errorHandler);
1255 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 Py_DECREF(unicode);
1257 return NULL;
1258}
1259
Tim Peters602f7402002-04-27 18:03:26 +00001260/* Allocation strategy: if the string is short, convert into a stack buffer
1261 and allocate exactly as much space needed at the end. Else allocate the
1262 maximum possible needed (4 result bytes per Unicode character), and return
1263 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001264*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001265PyObject *
1266PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1267 int size,
1268 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269{
Tim Peters602f7402002-04-27 18:03:26 +00001270#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001271
Tim Peters602f7402002-04-27 18:03:26 +00001272 int i; /* index into s of next input byte */
1273 PyObject *v; /* result string object */
1274 char *p; /* next free byte in output buffer */
1275 int nallocated; /* number of result bytes allocated */
1276 int nneeded; /* number of result bytes needed */
1277 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001278
Tim Peters602f7402002-04-27 18:03:26 +00001279 assert(s != NULL);
1280 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281
Tim Peters602f7402002-04-27 18:03:26 +00001282 if (size <= MAX_SHORT_UNICHARS) {
1283 /* Write into the stack buffer; nallocated can't overflow.
1284 * At the end, we'll allocate exactly as much heap space as it
1285 * turns out we need.
1286 */
1287 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1288 v = NULL; /* will allocate after we're done */
1289 p = stackbuf;
1290 }
1291 else {
1292 /* Overallocate on the heap, and give the excess back at the end. */
1293 nallocated = size * 4;
1294 if (nallocated / 4 != size) /* overflow! */
1295 return PyErr_NoMemory();
1296 v = PyString_FromStringAndSize(NULL, nallocated);
1297 if (v == NULL)
1298 return NULL;
1299 p = PyString_AS_STRING(v);
1300 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001301
Tim Peters602f7402002-04-27 18:03:26 +00001302 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001303 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001304
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001305 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001306 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001308
Guido van Rossumd57fd912000-03-10 22:53:23 +00001309 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001310 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001311 *p++ = (char)(0xc0 | (ch >> 6));
1312 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001313 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001314 else {
Tim Peters602f7402002-04-27 18:03:26 +00001315 /* Encode UCS2 Unicode ordinals */
1316 if (ch < 0x10000) {
1317 /* Special case: check for high surrogate */
1318 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1319 Py_UCS4 ch2 = s[i];
1320 /* Check for low surrogate and combine the two to
1321 form a UCS4 value */
1322 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001323 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001324 i++;
1325 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001326 }
Tim Peters602f7402002-04-27 18:03:26 +00001327 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001328 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001329 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001330 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1331 *p++ = (char)(0x80 | (ch & 0x3f));
1332 continue;
1333 }
1334encodeUCS4:
1335 /* Encode UCS4 Unicode ordinals */
1336 *p++ = (char)(0xf0 | (ch >> 18));
1337 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1338 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1339 *p++ = (char)(0x80 | (ch & 0x3f));
1340 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001342
Tim Peters602f7402002-04-27 18:03:26 +00001343 if (v == NULL) {
1344 /* This was stack allocated. */
1345 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1346 assert(nneeded <= nallocated);
1347 v = PyString_FromStringAndSize(stackbuf, nneeded);
1348 }
1349 else {
1350 /* Cut back to size actually needed. */
1351 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1352 assert(nneeded <= nallocated);
1353 _PyString_Resize(&v, nneeded);
1354 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001356
Tim Peters602f7402002-04-27 18:03:26 +00001357#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358}
1359
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1361{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362 if (!PyUnicode_Check(unicode)) {
1363 PyErr_BadArgument();
1364 return NULL;
1365 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001366 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1367 PyUnicode_GET_SIZE(unicode),
1368 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001369}
1370
1371/* --- UTF-16 Codec ------------------------------------------------------- */
1372
Tim Peters772747b2001-08-09 22:21:55 +00001373PyObject *
1374PyUnicode_DecodeUTF16(const char *s,
1375 int size,
1376 const char *errors,
1377 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001379 const char *starts = s;
1380 int startinpos;
1381 int endinpos;
1382 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 PyUnicodeObject *unicode;
1384 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001385 const unsigned char *q, *e;
1386 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001387 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001388 /* Offsets from q for retrieving byte pairs in the right order. */
1389#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1390 int ihi = 1, ilo = 0;
1391#else
1392 int ihi = 0, ilo = 1;
1393#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001394 PyObject *errorHandler = NULL;
1395 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396
1397 /* Note: size will always be longer than the resulting Unicode
1398 character count */
1399 unicode = _PyUnicode_New(size);
1400 if (!unicode)
1401 return NULL;
1402 if (size == 0)
1403 return (PyObject *)unicode;
1404
1405 /* Unpack UTF-16 encoded data */
1406 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001407 q = (unsigned char *)s;
1408 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409
1410 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001411 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001413 /* Check for BOM marks (U+FEFF) in the input and adjust current
1414 byte order setting accordingly. In native mode, the leading BOM
1415 mark is skipped, in all other modes, it is copied to the output
1416 stream as-is (giving a ZWNBSP character). */
1417 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001418 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001419#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001420 if (bom == 0xFEFF) {
1421 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001422 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001423 }
1424 else if (bom == 0xFFFE) {
1425 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001426 bo = 1;
1427 }
1428#else
Tim Peters772747b2001-08-09 22:21:55 +00001429 if (bom == 0xFEFF) {
1430 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001431 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001432 }
1433 else if (bom == 0xFFFE) {
1434 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001435 bo = -1;
1436 }
1437#endif
1438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001439
Tim Peters772747b2001-08-09 22:21:55 +00001440 if (bo == -1) {
1441 /* force LE */
1442 ihi = 1;
1443 ilo = 0;
1444 }
1445 else if (bo == 1) {
1446 /* force BE */
1447 ihi = 0;
1448 ilo = 1;
1449 }
1450
1451 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452 Py_UNICODE ch;
1453 /* remaing bytes at the end? (size should be even) */
1454 if (e-q<2) {
1455 errmsg = "truncated data";
1456 startinpos = ((const char *)q)-starts;
1457 endinpos = ((const char *)e)-starts;
1458 goto utf16Error;
1459 /* The remaining input chars are ignored if the callback
1460 chooses to skip the input */
1461 }
1462 ch = (q[ihi] << 8) | q[ilo];
1463
Tim Peters772747b2001-08-09 22:21:55 +00001464 q += 2;
1465
Guido van Rossumd57fd912000-03-10 22:53:23 +00001466 if (ch < 0xD800 || ch > 0xDFFF) {
1467 *p++ = ch;
1468 continue;
1469 }
1470
1471 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001472 if (q >= e) {
1473 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001474 startinpos = (((const char *)q)-2)-starts;
1475 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001476 goto utf16Error;
1477 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001478 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001479 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1480 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001481 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001482#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001483 *p++ = ch;
1484 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001485#else
1486 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001487#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001488 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001489 }
1490 else {
1491 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001492 startinpos = (((const char *)q)-4)-starts;
1493 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001494 goto utf16Error;
1495 }
1496
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001498 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001499 startinpos = (((const char *)q)-2)-starts;
1500 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001501 /* Fall through to report the error */
1502
1503 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 outpos = p-PyUnicode_AS_UNICODE(unicode);
1505 if (unicode_decode_call_errorhandler(
1506 errors, &errorHandler,
1507 "utf16", errmsg,
1508 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1509 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001510 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511 }
1512
1513 if (byteorder)
1514 *byteorder = bo;
1515
1516 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001517 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518 goto onError;
1519
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001520 Py_XDECREF(errorHandler);
1521 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001522 return (PyObject *)unicode;
1523
1524onError:
1525 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001526 Py_XDECREF(errorHandler);
1527 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001528 return NULL;
1529}
1530
Tim Peters772747b2001-08-09 22:21:55 +00001531PyObject *
1532PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1533 int size,
1534 const char *errors,
1535 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001536{
1537 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001538 unsigned char *p;
1539 int i, pairs;
1540 /* Offsets from p for storing byte pairs in the right order. */
1541#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1542 int ihi = 1, ilo = 0;
1543#else
1544 int ihi = 0, ilo = 1;
1545#endif
1546
1547#define STORECHAR(CH) \
1548 do { \
1549 p[ihi] = ((CH) >> 8) & 0xff; \
1550 p[ilo] = (CH) & 0xff; \
1551 p += 2; \
1552 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001554 for (i = pairs = 0; i < size; i++)
1555 if (s[i] >= 0x10000)
1556 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001558 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559 if (v == NULL)
1560 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561
Tim Peters772747b2001-08-09 22:21:55 +00001562 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001564 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001565 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001566 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001567
1568 if (byteorder == -1) {
1569 /* force LE */
1570 ihi = 1;
1571 ilo = 0;
1572 }
1573 else if (byteorder == 1) {
1574 /* force BE */
1575 ihi = 0;
1576 ilo = 1;
1577 }
1578
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001579 while (size-- > 0) {
1580 Py_UNICODE ch = *s++;
1581 Py_UNICODE ch2 = 0;
1582 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001583 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1584 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585 }
Tim Peters772747b2001-08-09 22:21:55 +00001586 STORECHAR(ch);
1587 if (ch2)
1588 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001591#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592}
1593
1594PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1595{
1596 if (!PyUnicode_Check(unicode)) {
1597 PyErr_BadArgument();
1598 return NULL;
1599 }
1600 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1601 PyUnicode_GET_SIZE(unicode),
1602 NULL,
1603 0);
1604}
1605
1606/* --- Unicode Escape Codec ----------------------------------------------- */
1607
Fredrik Lundh06d12682001-01-24 07:59:11 +00001608static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001609
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1611 int size,
1612 const char *errors)
1613{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001614 const char *starts = s;
1615 int startinpos;
1616 int endinpos;
1617 int outpos;
1618 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001620 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001622 char* message;
1623 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 PyObject *errorHandler = NULL;
1625 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001626
Guido van Rossumd57fd912000-03-10 22:53:23 +00001627 /* Escaped strings will always be longer than the resulting
1628 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001629 length after conversion to the true value.
1630 (but if the error callback returns a long replacement string
1631 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001632 v = _PyUnicode_New(size);
1633 if (v == NULL)
1634 goto onError;
1635 if (size == 0)
1636 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001637
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001638 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001639 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001640
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641 while (s < end) {
1642 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001643 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001644 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001645
1646 /* Non-escape characters are interpreted as Unicode ordinals */
1647 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001648 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649 continue;
1650 }
1651
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653 /* \ - Escapes */
1654 s++;
1655 switch (*s++) {
1656
1657 /* \x escapes */
1658 case '\n': break;
1659 case '\\': *p++ = '\\'; break;
1660 case '\'': *p++ = '\''; break;
1661 case '\"': *p++ = '\"'; break;
1662 case 'b': *p++ = '\b'; break;
1663 case 'f': *p++ = '\014'; break; /* FF */
1664 case 't': *p++ = '\t'; break;
1665 case 'n': *p++ = '\n'; break;
1666 case 'r': *p++ = '\r'; break;
1667 case 'v': *p++ = '\013'; break; /* VT */
1668 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1669
1670 /* \OOO (octal) escapes */
1671 case '0': case '1': case '2': case '3':
1672 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001673 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001675 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001676 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001677 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001678 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001679 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 break;
1681
Fredrik Lundhccc74732001-02-18 22:13:49 +00001682 /* hex escapes */
1683 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001685 digits = 2;
1686 message = "truncated \\xXX escape";
1687 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688
Fredrik Lundhccc74732001-02-18 22:13:49 +00001689 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001690 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001691 digits = 4;
1692 message = "truncated \\uXXXX escape";
1693 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694
Fredrik Lundhccc74732001-02-18 22:13:49 +00001695 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001696 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001697 digits = 8;
1698 message = "truncated \\UXXXXXXXX escape";
1699 hexescape:
1700 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001701 outpos = p-PyUnicode_AS_UNICODE(v);
1702 if (s+digits>end) {
1703 endinpos = size;
1704 if (unicode_decode_call_errorhandler(
1705 errors, &errorHandler,
1706 "unicodeescape", "end of string in escape sequence",
1707 starts, size, &startinpos, &endinpos, &exc, &s,
1708 (PyObject **)&v, &outpos, &p))
1709 goto onError;
1710 goto nextByte;
1711 }
1712 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001713 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001714 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001715 endinpos = (s+i+1)-starts;
1716 if (unicode_decode_call_errorhandler(
1717 errors, &errorHandler,
1718 "unicodeescape", message,
1719 starts, size, &startinpos, &endinpos, &exc, &s,
1720 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001721 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001722 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001723 }
1724 chr = (chr<<4) & ~0xF;
1725 if (c >= '0' && c <= '9')
1726 chr += c - '0';
1727 else if (c >= 'a' && c <= 'f')
1728 chr += 10 + c - 'a';
1729 else
1730 chr += 10 + c - 'A';
1731 }
1732 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001733 if (chr == 0xffffffff)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001734 /* _decoding_error will have already written into the
1735 target buffer. */
1736 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001737 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001738 /* when we get here, chr is a 32-bit unicode character */
1739 if (chr <= 0xffff)
1740 /* UCS-2 character */
1741 *p++ = (Py_UNICODE) chr;
1742 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001743 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001744 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001745#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001746 *p++ = chr;
1747#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001748 chr -= 0x10000L;
1749 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001750 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001751#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001752 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 endinpos = s-starts;
1754 outpos = p-PyUnicode_AS_UNICODE(v);
1755 if (unicode_decode_call_errorhandler(
1756 errors, &errorHandler,
1757 "unicodeescape", "illegal Unicode character",
1758 starts, size, &startinpos, &endinpos, &exc, &s,
1759 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001760 goto onError;
1761 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001762 break;
1763
1764 /* \N{name} */
1765 case 'N':
1766 message = "malformed \\N character escape";
1767 if (ucnhash_CAPI == NULL) {
1768 /* load the unicode data module */
1769 PyObject *m, *v;
1770 m = PyImport_ImportModule("unicodedata");
1771 if (m == NULL)
1772 goto ucnhashError;
1773 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1774 Py_DECREF(m);
1775 if (v == NULL)
1776 goto ucnhashError;
1777 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1778 Py_DECREF(v);
1779 if (ucnhash_CAPI == NULL)
1780 goto ucnhashError;
1781 }
1782 if (*s == '{') {
1783 const char *start = s+1;
1784 /* look for the closing brace */
1785 while (*s != '}' && s < end)
1786 s++;
1787 if (s > start && s < end && *s == '}') {
1788 /* found a name. look it up in the unicode database */
1789 message = "unknown Unicode character name";
1790 s++;
1791 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1792 goto store;
1793 }
1794 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 endinpos = s-starts;
1796 outpos = p-PyUnicode_AS_UNICODE(v);
1797 if (unicode_decode_call_errorhandler(
1798 errors, &errorHandler,
1799 "unicodeescape", message,
1800 starts, size, &startinpos, &endinpos, &exc, &s,
1801 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001802 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001803 break;
1804
1805 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001806 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001807 message = "\\ at end of string";
1808 s--;
1809 endinpos = s-starts;
1810 outpos = p-PyUnicode_AS_UNICODE(v);
1811 if (unicode_decode_call_errorhandler(
1812 errors, &errorHandler,
1813 "unicodeescape", message,
1814 starts, size, &startinpos, &endinpos, &exc, &s,
1815 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001816 goto onError;
1817 }
1818 else {
1819 *p++ = '\\';
1820 *p++ = (unsigned char)s[-1];
1821 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001824 nextByte:
1825 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1828 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001830
Fredrik Lundhccc74732001-02-18 22:13:49 +00001831ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001832 PyErr_SetString(
1833 PyExc_UnicodeError,
1834 "\\N escapes not supported (can't load unicodedata module)"
1835 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001836 Py_XDECREF(errorHandler);
1837 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001838 return NULL;
1839
Fredrik Lundhccc74732001-02-18 22:13:49 +00001840onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001842 Py_XDECREF(errorHandler);
1843 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 return NULL;
1845}
1846
1847/* Return a Unicode-Escape string version of the Unicode object.
1848
1849 If quotes is true, the string is enclosed in u"" or u'' quotes as
1850 appropriate.
1851
1852*/
1853
Barry Warsaw51ac5802000-03-20 16:36:48 +00001854static const Py_UNICODE *findchar(const Py_UNICODE *s,
1855 int size,
1856 Py_UNICODE ch);
1857
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858static
1859PyObject *unicodeescape_string(const Py_UNICODE *s,
1860 int size,
1861 int quotes)
1862{
1863 PyObject *repr;
1864 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001866 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867
1868 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1869 if (repr == NULL)
1870 return NULL;
1871
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001872 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873
1874 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 *p++ = 'u';
1876 *p++ = (findchar(s, size, '\'') &&
1877 !findchar(s, size, '"')) ? '"' : '\'';
1878 }
1879 while (size-- > 0) {
1880 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001881
Guido van Rossumd57fd912000-03-10 22:53:23 +00001882 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001883 if (quotes &&
1884 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 *p++ = '\\';
1886 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001887 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001889
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001890#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001891 /* Map 21-bit characters to '\U00xxxxxx' */
1892 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001893 int offset = p - PyString_AS_STRING(repr);
1894
1895 /* Resize the string if necessary */
1896 if (offset + 12 > PyString_GET_SIZE(repr)) {
1897 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001898 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001899 p = PyString_AS_STRING(repr) + offset;
1900 }
1901
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001902 *p++ = '\\';
1903 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001904 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1905 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1906 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1907 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1908 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1909 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1910 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001911 *p++ = hexdigit[ch & 0x0000000F];
1912 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001913 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001914#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001915 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1916 else if (ch >= 0xD800 && ch < 0xDC00) {
1917 Py_UNICODE ch2;
1918 Py_UCS4 ucs;
1919
1920 ch2 = *s++;
1921 size--;
1922 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1923 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1924 *p++ = '\\';
1925 *p++ = 'U';
1926 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1927 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1928 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1929 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1930 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1931 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1932 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1933 *p++ = hexdigit[ucs & 0x0000000F];
1934 continue;
1935 }
1936 /* Fall through: isolated surrogates are copied as-is */
1937 s--;
1938 size++;
1939 }
1940
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001942 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943 *p++ = '\\';
1944 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001945 *p++ = hexdigit[(ch >> 12) & 0x000F];
1946 *p++ = hexdigit[(ch >> 8) & 0x000F];
1947 *p++ = hexdigit[(ch >> 4) & 0x000F];
1948 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001950
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001951 /* Map special whitespace to '\t', \n', '\r' */
1952 else if (ch == '\t') {
1953 *p++ = '\\';
1954 *p++ = 't';
1955 }
1956 else if (ch == '\n') {
1957 *p++ = '\\';
1958 *p++ = 'n';
1959 }
1960 else if (ch == '\r') {
1961 *p++ = '\\';
1962 *p++ = 'r';
1963 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001964
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001965 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001966 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001968 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001969 *p++ = hexdigit[(ch >> 4) & 0x000F];
1970 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001972
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 /* Copy everything else as-is */
1974 else
1975 *p++ = (char) ch;
1976 }
1977 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001978 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979
1980 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001981 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982 return repr;
1983}
1984
1985PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1986 int size)
1987{
1988 return unicodeescape_string(s, size, 0);
1989}
1990
1991PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1992{
1993 if (!PyUnicode_Check(unicode)) {
1994 PyErr_BadArgument();
1995 return NULL;
1996 }
1997 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1998 PyUnicode_GET_SIZE(unicode));
1999}
2000
2001/* --- Raw Unicode Escape Codec ------------------------------------------- */
2002
2003PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2004 int size,
2005 const char *errors)
2006{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002007 const char *starts = s;
2008 int startinpos;
2009 int endinpos;
2010 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002012 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 const char *end;
2014 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002015 PyObject *errorHandler = NULL;
2016 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017
2018 /* Escaped strings will always be longer than the resulting
2019 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002020 length after conversion to the true value. (But decoding error
2021 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022 v = _PyUnicode_New(size);
2023 if (v == NULL)
2024 goto onError;
2025 if (size == 0)
2026 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002027 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 end = s + size;
2029 while (s < end) {
2030 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002031 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032 int i;
2033
2034 /* Non-escape characters are interpreted as Unicode ordinals */
2035 if (*s != '\\') {
2036 *p++ = (unsigned char)*s++;
2037 continue;
2038 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002039 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040
2041 /* \u-escapes are only interpreted iff the number of leading
2042 backslashes if odd */
2043 bs = s;
2044 for (;s < end;) {
2045 if (*s != '\\')
2046 break;
2047 *p++ = (unsigned char)*s++;
2048 }
2049 if (((s - bs) & 1) == 0 ||
2050 s >= end ||
2051 *s != 'u') {
2052 continue;
2053 }
2054 p--;
2055 s++;
2056
2057 /* \uXXXX with 4 hex digits */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002058 outpos = p-PyUnicode_AS_UNICODE(v);
2059 for (x = 0, i = 0; i < 4; ++i, ++s) {
2060 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 endinpos = s-starts;
2063 if (unicode_decode_call_errorhandler(
2064 errors, &errorHandler,
2065 "rawunicodeescape", "truncated \\uXXXX",
2066 starts, size, &startinpos, &endinpos, &exc, &s,
2067 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 }
2071 x = (x<<4) & ~0xF;
2072 if (c >= '0' && c <= '9')
2073 x += c - '0';
2074 else if (c >= 'a' && c <= 'f')
2075 x += 10 + c - 'a';
2076 else
2077 x += 10 + c - 'A';
2078 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 *p++ = x;
2080 nextByte:
2081 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002083 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002084 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085 Py_XDECREF(errorHandler);
2086 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 return (PyObject *)v;
2088
2089 onError:
2090 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002091 Py_XDECREF(errorHandler);
2092 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 return NULL;
2094}
2095
2096PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2097 int size)
2098{
2099 PyObject *repr;
2100 char *p;
2101 char *q;
2102
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002103 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104
2105 repr = PyString_FromStringAndSize(NULL, 6 * size);
2106 if (repr == NULL)
2107 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002108 if (size == 0)
2109 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110
2111 p = q = PyString_AS_STRING(repr);
2112 while (size-- > 0) {
2113 Py_UNICODE ch = *s++;
2114 /* Map 16-bit characters to '\uxxxx' */
2115 if (ch >= 256) {
2116 *p++ = '\\';
2117 *p++ = 'u';
2118 *p++ = hexdigit[(ch >> 12) & 0xf];
2119 *p++ = hexdigit[(ch >> 8) & 0xf];
2120 *p++ = hexdigit[(ch >> 4) & 0xf];
2121 *p++ = hexdigit[ch & 15];
2122 }
2123 /* Copy everything else as-is */
2124 else
2125 *p++ = (char) ch;
2126 }
2127 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002128 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 return repr;
2130}
2131
2132PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2133{
2134 if (!PyUnicode_Check(unicode)) {
2135 PyErr_BadArgument();
2136 return NULL;
2137 }
2138 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2139 PyUnicode_GET_SIZE(unicode));
2140}
2141
2142/* --- Latin-1 Codec ------------------------------------------------------ */
2143
2144PyObject *PyUnicode_DecodeLatin1(const char *s,
2145 int size,
2146 const char *errors)
2147{
2148 PyUnicodeObject *v;
2149 Py_UNICODE *p;
2150
2151 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002152 if (size == 1 && *(unsigned char*)s < 256) {
2153 Py_UNICODE r = *(unsigned char*)s;
2154 return PyUnicode_FromUnicode(&r, 1);
2155 }
2156
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 v = _PyUnicode_New(size);
2158 if (v == NULL)
2159 goto onError;
2160 if (size == 0)
2161 return (PyObject *)v;
2162 p = PyUnicode_AS_UNICODE(v);
2163 while (size-- > 0)
2164 *p++ = (unsigned char)*s++;
2165 return (PyObject *)v;
2166
2167 onError:
2168 Py_XDECREF(v);
2169 return NULL;
2170}
2171
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002172/* create or adjust a UnicodeEncodeError */
2173static void make_encode_exception(PyObject **exceptionObject,
2174 const char *encoding,
2175 const Py_UNICODE *unicode, int size,
2176 int startpos, int endpos,
2177 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002179 if (*exceptionObject == NULL) {
2180 *exceptionObject = PyUnicodeEncodeError_Create(
2181 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 }
2183 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002184 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2185 goto onError;
2186 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2187 goto onError;
2188 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2189 goto onError;
2190 return;
2191 onError:
2192 Py_DECREF(*exceptionObject);
2193 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002194 }
2195}
2196
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002197/* raises a UnicodeEncodeError */
2198static void raise_encode_exception(PyObject **exceptionObject,
2199 const char *encoding,
2200 const Py_UNICODE *unicode, int size,
2201 int startpos, int endpos,
2202 const char *reason)
2203{
2204 make_encode_exception(exceptionObject,
2205 encoding, unicode, size, startpos, endpos, reason);
2206 if (*exceptionObject != NULL)
2207 PyCodec_StrictErrors(*exceptionObject);
2208}
2209
2210/* error handling callback helper:
2211 build arguments, call the callback and check the arguments,
2212 put the result into newpos and return the replacement string, which
2213 has to be freed by the caller */
2214static PyObject *unicode_encode_call_errorhandler(const char *errors,
2215 PyObject **errorHandler,
2216 const char *encoding, const char *reason,
2217 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2218 int startpos, int endpos,
2219 int *newpos)
2220{
2221 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2222
2223 PyObject *restuple;
2224 PyObject *resunicode;
2225
2226 if (*errorHandler == NULL) {
2227 *errorHandler = PyCodec_LookupError(errors);
2228 if (*errorHandler == NULL)
2229 return NULL;
2230 }
2231
2232 make_encode_exception(exceptionObject,
2233 encoding, unicode, size, startpos, endpos, reason);
2234 if (*exceptionObject == NULL)
2235 return NULL;
2236
2237 restuple = PyObject_CallFunctionObjArgs(
2238 *errorHandler, *exceptionObject, NULL);
2239 if (restuple == NULL)
2240 return NULL;
2241 if (!PyTuple_Check(restuple)) {
2242 PyErr_Format(PyExc_TypeError, &argparse[4]);
2243 Py_DECREF(restuple);
2244 return NULL;
2245 }
2246 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2247 &resunicode, newpos)) {
2248 Py_DECREF(restuple);
2249 return NULL;
2250 }
2251 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002252 *newpos = size+*newpos;
2253 if (*newpos<0 || *newpos>size) {
2254 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2255 Py_DECREF(restuple);
2256 return NULL;
2257 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002258 Py_INCREF(resunicode);
2259 Py_DECREF(restuple);
2260 return resunicode;
2261}
2262
2263static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2264 int size,
2265 const char *errors,
2266 int limit)
2267{
2268 /* output object */
2269 PyObject *res;
2270 /* pointers to the beginning and end+1 of input */
2271 const Py_UNICODE *startp = p;
2272 const Py_UNICODE *endp = p + size;
2273 /* pointer to the beginning of the unencodable characters */
2274 /* const Py_UNICODE *badp = NULL; */
2275 /* pointer into the output */
2276 char *str;
2277 /* current output position */
2278 int respos = 0;
2279 int ressize;
2280 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2281 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2282 PyObject *errorHandler = NULL;
2283 PyObject *exc = NULL;
2284 /* the following variable is used for caching string comparisons
2285 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2286 int known_errorHandler = -1;
2287
2288 /* allocate enough for a simple encoding without
2289 replacements, if we need more, we'll resize */
2290 res = PyString_FromStringAndSize(NULL, size);
2291 if (res == NULL)
2292 goto onError;
2293 if (size == 0)
2294 return res;
2295 str = PyString_AS_STRING(res);
2296 ressize = size;
2297
2298 while (p<endp) {
2299 Py_UNICODE c = *p;
2300
2301 /* can we encode this? */
2302 if (c<limit) {
2303 /* no overflow check, because we know that the space is enough */
2304 *str++ = (char)c;
2305 ++p;
2306 }
2307 else {
2308 int unicodepos = p-startp;
2309 int requiredsize;
2310 PyObject *repunicode;
2311 int repsize;
2312 int newpos;
2313 int respos;
2314 Py_UNICODE *uni2;
2315 /* startpos for collecting unencodable chars */
2316 const Py_UNICODE *collstart = p;
2317 const Py_UNICODE *collend = p;
2318 /* find all unecodable characters */
2319 while ((collend < endp) && ((*collend)>=limit))
2320 ++collend;
2321 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2322 if (known_errorHandler==-1) {
2323 if ((errors==NULL) || (!strcmp(errors, "strict")))
2324 known_errorHandler = 1;
2325 else if (!strcmp(errors, "replace"))
2326 known_errorHandler = 2;
2327 else if (!strcmp(errors, "ignore"))
2328 known_errorHandler = 3;
2329 else if (!strcmp(errors, "xmlcharrefreplace"))
2330 known_errorHandler = 4;
2331 else
2332 known_errorHandler = 0;
2333 }
2334 switch (known_errorHandler) {
2335 case 1: /* strict */
2336 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2337 goto onError;
2338 case 2: /* replace */
2339 while (collstart++<collend)
2340 *str++ = '?'; /* fall through */
2341 case 3: /* ignore */
2342 p = collend;
2343 break;
2344 case 4: /* xmlcharrefreplace */
2345 respos = str-PyString_AS_STRING(res);
2346 /* determine replacement size (temporarily (mis)uses p) */
2347 for (p = collstart, repsize = 0; p < collend; ++p) {
2348 if (*p<10)
2349 repsize += 2+1+1;
2350 else if (*p<100)
2351 repsize += 2+2+1;
2352 else if (*p<1000)
2353 repsize += 2+3+1;
2354 else if (*p<10000)
2355 repsize += 2+4+1;
2356 else if (*p<100000)
2357 repsize += 2+5+1;
2358 else if (*p<1000000)
2359 repsize += 2+6+1;
2360 else
2361 repsize += 2+7+1;
2362 }
2363 requiredsize = respos+repsize+(endp-collend);
2364 if (requiredsize > ressize) {
2365 if (requiredsize<2*ressize)
2366 requiredsize = 2*ressize;
2367 if (_PyString_Resize(&res, requiredsize))
2368 goto onError;
2369 str = PyString_AS_STRING(res) + respos;
2370 ressize = requiredsize;
2371 }
2372 /* generate replacement (temporarily (mis)uses p) */
2373 for (p = collstart; p < collend; ++p) {
2374 str += sprintf(str, "&#%d;", (int)*p);
2375 }
2376 p = collend;
2377 break;
2378 default:
2379 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2380 encoding, reason, startp, size, &exc,
2381 collstart-startp, collend-startp, &newpos);
2382 if (repunicode == NULL)
2383 goto onError;
2384 /* need more space? (at least enough for what we
2385 have+the replacement+the rest of the string, so
2386 we won't have to check space for encodable characters) */
2387 respos = str-PyString_AS_STRING(res);
2388 repsize = PyUnicode_GET_SIZE(repunicode);
2389 requiredsize = respos+repsize+(endp-collend);
2390 if (requiredsize > ressize) {
2391 if (requiredsize<2*ressize)
2392 requiredsize = 2*ressize;
2393 if (_PyString_Resize(&res, requiredsize)) {
2394 Py_DECREF(repunicode);
2395 goto onError;
2396 }
2397 str = PyString_AS_STRING(res) + respos;
2398 ressize = requiredsize;
2399 }
2400 /* check if there is anything unencodable in the replacement
2401 and copy it to the output */
2402 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2403 c = *uni2;
2404 if (c >= limit) {
2405 raise_encode_exception(&exc, encoding, startp, size,
2406 unicodepos, unicodepos+1, reason);
2407 Py_DECREF(repunicode);
2408 goto onError;
2409 }
2410 *str = (char)c;
2411 }
2412 p = startp + newpos;
2413 Py_DECREF(repunicode);
2414 }
2415 }
2416 }
2417 /* Resize if we allocated to much */
2418 respos = str-PyString_AS_STRING(res);
2419 if (respos<ressize)
2420 /* If this falls res will be NULL */
2421 _PyString_Resize(&res, respos);
2422 Py_XDECREF(errorHandler);
2423 Py_XDECREF(exc);
2424 return res;
2425
2426 onError:
2427 Py_XDECREF(res);
2428 Py_XDECREF(errorHandler);
2429 Py_XDECREF(exc);
2430 return NULL;
2431}
2432
Guido van Rossumd57fd912000-03-10 22:53:23 +00002433PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2434 int size,
2435 const char *errors)
2436{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002437 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002438}
2439
2440PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2441{
2442 if (!PyUnicode_Check(unicode)) {
2443 PyErr_BadArgument();
2444 return NULL;
2445 }
2446 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2447 PyUnicode_GET_SIZE(unicode),
2448 NULL);
2449}
2450
2451/* --- 7-bit ASCII Codec -------------------------------------------------- */
2452
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453PyObject *PyUnicode_DecodeASCII(const char *s,
2454 int size,
2455 const char *errors)
2456{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002457 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 PyUnicodeObject *v;
2459 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002460 int startinpos;
2461 int endinpos;
2462 int outpos;
2463 const char *e;
2464 PyObject *errorHandler = NULL;
2465 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466
2467 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002468 if (size == 1 && *(unsigned char*)s < 128) {
2469 Py_UNICODE r = *(unsigned char*)s;
2470 return PyUnicode_FromUnicode(&r, 1);
2471 }
2472
Guido van Rossumd57fd912000-03-10 22:53:23 +00002473 v = _PyUnicode_New(size);
2474 if (v == NULL)
2475 goto onError;
2476 if (size == 0)
2477 return (PyObject *)v;
2478 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 e = s + size;
2480 while (s < e) {
2481 register unsigned char c = (unsigned char)*s;
2482 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002484 ++s;
2485 }
2486 else {
2487 startinpos = s-starts;
2488 endinpos = startinpos + 1;
2489 outpos = p-PyUnicode_AS_UNICODE(v);
2490 if (unicode_decode_call_errorhandler(
2491 errors, &errorHandler,
2492 "ascii", "ordinal not in range(128)",
2493 starts, size, &startinpos, &endinpos, &exc, &s,
2494 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002498 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002499 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002500 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002501 Py_XDECREF(errorHandler);
2502 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503 return (PyObject *)v;
2504
2505 onError:
2506 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002507 Py_XDECREF(errorHandler);
2508 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509 return NULL;
2510}
2511
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2513 int size,
2514 const char *errors)
2515{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002516 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517}
2518
2519PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2520{
2521 if (!PyUnicode_Check(unicode)) {
2522 PyErr_BadArgument();
2523 return NULL;
2524 }
2525 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2526 PyUnicode_GET_SIZE(unicode),
2527 NULL);
2528}
2529
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002530#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002531
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002532/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002533
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002534PyObject *PyUnicode_DecodeMBCS(const char *s,
2535 int size,
2536 const char *errors)
2537{
2538 PyUnicodeObject *v;
2539 Py_UNICODE *p;
2540
2541 /* First get the size of the result */
2542 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002543 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002544 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2545
2546 v = _PyUnicode_New(usize);
2547 if (v == NULL)
2548 return NULL;
2549 if (usize == 0)
2550 return (PyObject *)v;
2551 p = PyUnicode_AS_UNICODE(v);
2552 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2553 Py_DECREF(v);
2554 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2555 }
2556
2557 return (PyObject *)v;
2558}
2559
2560PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2561 int size,
2562 const char *errors)
2563{
2564 PyObject *repr;
2565 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002566 DWORD mbcssize;
2567
2568 /* If there are no characters, bail now! */
2569 if (size==0)
2570 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002571
2572 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002573 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002574 if (mbcssize==0)
2575 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2576
2577 repr = PyString_FromStringAndSize(NULL, mbcssize);
2578 if (repr == NULL)
2579 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002580 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002581 return repr;
2582
2583 /* Do the conversion */
2584 s = PyString_AS_STRING(repr);
2585 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2586 Py_DECREF(repr);
2587 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2588 }
2589 return repr;
2590}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002591
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002592#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002593
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594/* --- Character Mapping Codec -------------------------------------------- */
2595
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596PyObject *PyUnicode_DecodeCharmap(const char *s,
2597 int size,
2598 PyObject *mapping,
2599 const char *errors)
2600{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002601 const char *starts = s;
2602 int startinpos;
2603 int endinpos;
2604 int outpos;
2605 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606 PyUnicodeObject *v;
2607 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002608 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002609 PyObject *errorHandler = NULL;
2610 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611
2612 /* Default to Latin-1 */
2613 if (mapping == NULL)
2614 return PyUnicode_DecodeLatin1(s, size, errors);
2615
2616 v = _PyUnicode_New(size);
2617 if (v == NULL)
2618 goto onError;
2619 if (size == 0)
2620 return (PyObject *)v;
2621 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002622 e = s + size;
2623 while (s < e) {
2624 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 PyObject *w, *x;
2626
2627 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2628 w = PyInt_FromLong((long)ch);
2629 if (w == NULL)
2630 goto onError;
2631 x = PyObject_GetItem(mapping, w);
2632 Py_DECREF(w);
2633 if (x == NULL) {
2634 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002635 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002637 x = Py_None;
2638 Py_INCREF(x);
2639 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002640 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 }
2642
2643 /* Apply mapping */
2644 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002645 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646 if (value < 0 || value > 65535) {
2647 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002648 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649 Py_DECREF(x);
2650 goto onError;
2651 }
2652 *p++ = (Py_UNICODE)value;
2653 }
2654 else if (x == Py_None) {
2655 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002656 outpos = p-PyUnicode_AS_UNICODE(v);
2657 startinpos = s-starts;
2658 endinpos = startinpos+1;
2659 if (unicode_decode_call_errorhandler(
2660 errors, &errorHandler,
2661 "charmap", "character maps to <undefined>",
2662 starts, size, &startinpos, &endinpos, &exc, &s,
2663 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664 Py_DECREF(x);
2665 goto onError;
2666 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002667 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 }
2669 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002670 int targetsize = PyUnicode_GET_SIZE(x);
2671
2672 if (targetsize == 1)
2673 /* 1-1 mapping */
2674 *p++ = *PyUnicode_AS_UNICODE(x);
2675
2676 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002678 if (targetsize > extrachars) {
2679 /* resize first */
2680 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2681 int needed = (targetsize - extrachars) + \
2682 (targetsize << 2);
2683 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002684 if (_PyUnicode_Resize(&v,
2685 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002686 Py_DECREF(x);
2687 goto onError;
2688 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002689 p = PyUnicode_AS_UNICODE(v) + oldpos;
2690 }
2691 Py_UNICODE_COPY(p,
2692 PyUnicode_AS_UNICODE(x),
2693 targetsize);
2694 p += targetsize;
2695 extrachars -= targetsize;
2696 }
2697 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 }
2699 else {
2700 /* wrong return value */
2701 PyErr_SetString(PyExc_TypeError,
2702 "character mapping must return integer, None or unicode");
2703 Py_DECREF(x);
2704 goto onError;
2705 }
2706 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708 }
2709 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002710 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002712 Py_XDECREF(errorHandler);
2713 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 return (PyObject *)v;
2715
2716 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717 Py_XDECREF(errorHandler);
2718 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719 Py_XDECREF(v);
2720 return NULL;
2721}
2722
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002723/* Lookup the character ch in the mapping. If the character
2724 can't be found, Py_None is returned (or NULL, if another
2725 error occured). */
2726static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 PyObject *w = PyInt_FromLong((long)c);
2729 PyObject *x;
2730
2731 if (w == NULL)
2732 return NULL;
2733 x = PyObject_GetItem(mapping, w);
2734 Py_DECREF(w);
2735 if (x == NULL) {
2736 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2737 /* No mapping found means: mapping is undefined. */
2738 PyErr_Clear();
2739 x = Py_None;
2740 Py_INCREF(x);
2741 return x;
2742 } else
2743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002745 else if (x == Py_None)
2746 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 else if (PyInt_Check(x)) {
2748 long value = PyInt_AS_LONG(x);
2749 if (value < 0 || value > 255) {
2750 PyErr_SetString(PyExc_TypeError,
2751 "character mapping must be in range(256)");
2752 Py_DECREF(x);
2753 return NULL;
2754 }
2755 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002757 else if (PyString_Check(x))
2758 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 /* wrong return value */
2761 PyErr_SetString(PyExc_TypeError,
2762 "character mapping must return integer, None or str");
2763 Py_DECREF(x);
2764 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765 }
2766}
2767
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768/* lookup the character, put the result in the output string and adjust
2769 various state variables. Reallocate the output string if not enough
2770 space is available. Return a new reference to the object that
2771 was put in the output buffer, or Py_None, if the mapping was undefined
2772 (in which case no character was written) or NULL, if a
2773 reallocation error ocurred. The called must decref the result */
2774static
2775PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2776 PyObject **outobj, int *outpos)
2777{
2778 PyObject *rep = charmapencode_lookup(c, mapping);
2779
2780 if (rep==NULL)
2781 return NULL;
2782 else if (rep==Py_None)
2783 return rep;
2784 else {
2785 char *outstart = PyString_AS_STRING(*outobj);
2786 int outsize = PyString_GET_SIZE(*outobj);
2787 if (PyInt_Check(rep)) {
2788 int requiredsize = *outpos+1;
2789 if (outsize<requiredsize) {
2790 /* exponentially overallocate to minimize reallocations */
2791 if (requiredsize < 2*outsize)
2792 requiredsize = 2*outsize;
2793 if (_PyString_Resize(outobj, requiredsize)) {
2794 Py_DECREF(rep);
2795 return NULL;
2796 }
2797 outstart = PyString_AS_STRING(*outobj);
2798 }
2799 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2800 }
2801 else {
2802 const char *repchars = PyString_AS_STRING(rep);
2803 int repsize = PyString_GET_SIZE(rep);
2804 int requiredsize = *outpos+repsize;
2805 if (outsize<requiredsize) {
2806 /* exponentially overallocate to minimize reallocations */
2807 if (requiredsize < 2*outsize)
2808 requiredsize = 2*outsize;
2809 if (_PyString_Resize(outobj, requiredsize)) {
2810 Py_DECREF(rep);
2811 return NULL;
2812 }
2813 outstart = PyString_AS_STRING(*outobj);
2814 }
2815 memcpy(outstart + *outpos, repchars, repsize);
2816 *outpos += repsize;
2817 }
2818 }
2819 return rep;
2820}
2821
2822/* handle an error in PyUnicode_EncodeCharmap
2823 Return 0 on success, -1 on error */
2824static
2825int charmap_encoding_error(
2826 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2827 PyObject **exceptionObject,
2828 int *known_errorHandler, PyObject *errorHandler, const char *errors,
2829 PyObject **res, int *respos)
2830{
2831 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2832 int repsize;
2833 int newpos;
2834 Py_UNICODE *uni2;
2835 /* startpos for collecting unencodable chars */
2836 int collstartpos = *inpos;
2837 int collendpos = *inpos+1;
2838 int collpos;
2839 char *encoding = "charmap";
2840 char *reason = "character maps to <undefined>";
2841
2842 PyObject *x;
2843 /* find all unencodable characters */
2844 while (collendpos < size) {
2845 x = charmapencode_lookup(p[collendpos], mapping);
2846 if (x==NULL)
2847 return -1;
2848 else if (x!=Py_None) {
2849 Py_DECREF(x);
2850 break;
2851 }
2852 Py_DECREF(x);
2853 ++collendpos;
2854 }
2855 /* cache callback name lookup
2856 * (if not done yet, i.e. it's the first error) */
2857 if (*known_errorHandler==-1) {
2858 if ((errors==NULL) || (!strcmp(errors, "strict")))
2859 *known_errorHandler = 1;
2860 else if (!strcmp(errors, "replace"))
2861 *known_errorHandler = 2;
2862 else if (!strcmp(errors, "ignore"))
2863 *known_errorHandler = 3;
2864 else if (!strcmp(errors, "xmlcharrefreplace"))
2865 *known_errorHandler = 4;
2866 else
2867 *known_errorHandler = 0;
2868 }
2869 switch (*known_errorHandler) {
2870 case 1: /* strict */
2871 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2872 return -1;
2873 case 2: /* replace */
2874 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2875 x = charmapencode_output('?', mapping, res, respos);
2876 if (x==NULL) {
2877 return -1;
2878 }
2879 else if (x==Py_None) {
2880 Py_DECREF(x);
2881 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2882 return -1;
2883 }
2884 Py_DECREF(x);
2885 }
2886 /* fall through */
2887 case 3: /* ignore */
2888 *inpos = collendpos;
2889 break;
2890 case 4: /* xmlcharrefreplace */
2891 /* generate replacement (temporarily (mis)uses p) */
2892 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2893 char buffer[2+29+1+1];
2894 char *cp;
2895 sprintf(buffer, "&#%d;", (int)p[collpos]);
2896 for (cp = buffer; *cp; ++cp) {
2897 x = charmapencode_output(*cp, mapping, res, respos);
2898 if (x==NULL)
2899 return -1;
2900 else if (x==Py_None) {
2901 Py_DECREF(x);
2902 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2903 return -1;
2904 }
2905 Py_DECREF(x);
2906 }
2907 }
2908 *inpos = collendpos;
2909 break;
2910 default:
2911 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2912 encoding, reason, p, size, exceptionObject,
2913 collstartpos, collendpos, &newpos);
2914 if (repunicode == NULL)
2915 return -1;
2916 /* generate replacement */
2917 repsize = PyUnicode_GET_SIZE(repunicode);
2918 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2919 x = charmapencode_output(*uni2, mapping, res, respos);
2920 if (x==NULL) {
2921 Py_DECREF(repunicode);
2922 return -1;
2923 }
2924 else if (x==Py_None) {
2925 Py_DECREF(repunicode);
2926 Py_DECREF(x);
2927 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2928 return -1;
2929 }
2930 Py_DECREF(x);
2931 }
2932 *inpos = newpos;
2933 Py_DECREF(repunicode);
2934 }
2935 return 0;
2936}
2937
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2939 int size,
2940 PyObject *mapping,
2941 const char *errors)
2942{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002943 /* output object */
2944 PyObject *res = NULL;
2945 /* current input position */
2946 int inpos = 0;
2947 /* current output position */
2948 int respos = 0;
2949 PyObject *errorHandler = NULL;
2950 PyObject *exc = NULL;
2951 /* the following variable is used for caching string comparisons
2952 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
2953 * 3=ignore, 4=xmlcharrefreplace */
2954 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955
2956 /* Default to Latin-1 */
2957 if (mapping == NULL)
2958 return PyUnicode_EncodeLatin1(p, size, errors);
2959
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002960 /* allocate enough for a simple encoding without
2961 replacements, if we need more, we'll resize */
2962 res = PyString_FromStringAndSize(NULL, size);
2963 if (res == NULL)
2964 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002965 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002966 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002968 while (inpos<size) {
2969 /* try to encode it */
2970 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
2971 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002973 if (x==Py_None) { /* unencodable character */
2974 if (charmap_encoding_error(p, size, &inpos, mapping,
2975 &exc,
2976 &known_errorHandler, errorHandler, errors,
2977 &res, &respos))
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002978 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002980 else
2981 /* done with this character => adjust input position */
2982 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 Py_DECREF(x);
2984 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002986 /* Resize if we allocated to much */
2987 if (respos<PyString_GET_SIZE(res)) {
2988 if (_PyString_Resize(&res, respos))
2989 goto onError;
2990 }
2991 Py_XDECREF(exc);
2992 Py_XDECREF(errorHandler);
2993 return res;
2994
2995 onError:
2996 Py_XDECREF(res);
2997 Py_XDECREF(exc);
2998 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999 return NULL;
3000}
3001
3002PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3003 PyObject *mapping)
3004{
3005 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3006 PyErr_BadArgument();
3007 return NULL;
3008 }
3009 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3010 PyUnicode_GET_SIZE(unicode),
3011 mapping,
3012 NULL);
3013}
3014
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003015/* create or adjust a UnicodeTranslateError */
3016static void make_translate_exception(PyObject **exceptionObject,
3017 const Py_UNICODE *unicode, int size,
3018 int startpos, int endpos,
3019 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003021 if (*exceptionObject == NULL) {
3022 *exceptionObject = PyUnicodeTranslateError_Create(
3023 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 }
3025 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003026 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3027 goto onError;
3028 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3029 goto onError;
3030 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3031 goto onError;
3032 return;
3033 onError:
3034 Py_DECREF(*exceptionObject);
3035 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 }
3037}
3038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003039/* raises a UnicodeTranslateError */
3040static void raise_translate_exception(PyObject **exceptionObject,
3041 const Py_UNICODE *unicode, int size,
3042 int startpos, int endpos,
3043 const char *reason)
3044{
3045 make_translate_exception(exceptionObject,
3046 unicode, size, startpos, endpos, reason);
3047 if (*exceptionObject != NULL)
3048 PyCodec_StrictErrors(*exceptionObject);
3049}
3050
3051/* error handling callback helper:
3052 build arguments, call the callback and check the arguments,
3053 put the result into newpos and return the replacement string, which
3054 has to be freed by the caller */
3055static PyObject *unicode_translate_call_errorhandler(const char *errors,
3056 PyObject **errorHandler,
3057 const char *reason,
3058 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3059 int startpos, int endpos,
3060 int *newpos)
3061{
3062 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3063
3064 PyObject *restuple;
3065 PyObject *resunicode;
3066
3067 if (*errorHandler == NULL) {
3068 *errorHandler = PyCodec_LookupError(errors);
3069 if (*errorHandler == NULL)
3070 return NULL;
3071 }
3072
3073 make_translate_exception(exceptionObject,
3074 unicode, size, startpos, endpos, reason);
3075 if (*exceptionObject == NULL)
3076 return NULL;
3077
3078 restuple = PyObject_CallFunctionObjArgs(
3079 *errorHandler, *exceptionObject, NULL);
3080 if (restuple == NULL)
3081 return NULL;
3082 if (!PyTuple_Check(restuple)) {
3083 PyErr_Format(PyExc_TypeError, &argparse[4]);
3084 Py_DECREF(restuple);
3085 return NULL;
3086 }
3087 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3088 &resunicode, newpos)) {
3089 Py_DECREF(restuple);
3090 return NULL;
3091 }
3092 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003093 *newpos = size+*newpos;
3094 if (*newpos<0 || *newpos>size) {
3095 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3096 Py_DECREF(restuple);
3097 return NULL;
3098 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003099 Py_INCREF(resunicode);
3100 Py_DECREF(restuple);
3101 return resunicode;
3102}
3103
3104/* Lookup the character ch in the mapping and put the result in result,
3105 which must be decrefed by the caller.
3106 Return 0 on success, -1 on error */
3107static
3108int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3109{
3110 PyObject *w = PyInt_FromLong((long)c);
3111 PyObject *x;
3112
3113 if (w == NULL)
3114 return -1;
3115 x = PyObject_GetItem(mapping, w);
3116 Py_DECREF(w);
3117 if (x == NULL) {
3118 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3119 /* No mapping found means: use 1:1 mapping. */
3120 PyErr_Clear();
3121 *result = NULL;
3122 return 0;
3123 } else
3124 return -1;
3125 }
3126 else if (x == Py_None) {
3127 *result = x;
3128 return 0;
3129 }
3130 else if (PyInt_Check(x)) {
3131 long value = PyInt_AS_LONG(x);
3132 long max = PyUnicode_GetMax();
3133 if (value < 0 || value > max) {
3134 PyErr_Format(PyExc_TypeError,
3135 "character mapping must be in range(0x%lx)", max+1);
3136 Py_DECREF(x);
3137 return -1;
3138 }
3139 *result = x;
3140 return 0;
3141 }
3142 else if (PyUnicode_Check(x)) {
3143 *result = x;
3144 return 0;
3145 }
3146 else {
3147 /* wrong return value */
3148 PyErr_SetString(PyExc_TypeError,
3149 "character mapping must return integer, None or unicode");
3150 return -1;
3151 }
3152}
3153/* ensure that *outobj is at least requiredsize characters long,
3154if not reallocate and adjust various state variables.
3155Return 0 on success, -1 on error */
3156static
3157int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3158 int requiredsize)
3159{
3160 if (requiredsize > *outsize) {
3161 /* remember old output position */
3162 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3163 /* exponentially overallocate to minimize reallocations */
3164 if (requiredsize < 2 * *outsize)
3165 requiredsize = 2 * *outsize;
3166 if (_PyUnicode_Resize(outobj, requiredsize))
3167 return -1;
3168 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3169 *outsize = requiredsize;
3170 }
3171 return 0;
3172}
3173/* lookup the character, put the result in the output string and adjust
3174 various state variables. Return a new reference to the object that
3175 was put in the output buffer in *result, or Py_None, if the mapping was
3176 undefined (in which case no character was written).
3177 The called must decref result.
3178 Return 0 on success, -1 on error. */
3179static
3180int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3181 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3182{
3183 if (charmaptranslate_lookup(c, mapping, res))
3184 return -1;
3185 if (*res==NULL) {
3186 /* not found => default to 1:1 mapping */
3187 *(*outp)++ = (Py_UNICODE)c;
3188 }
3189 else if (*res==Py_None)
3190 ;
3191 else if (PyInt_Check(*res)) {
3192 /* no overflow check, because we know that the space is enough */
3193 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3194 }
3195 else if (PyUnicode_Check(*res)) {
3196 int repsize = PyUnicode_GET_SIZE(*res);
3197 if (repsize==1) {
3198 /* no overflow check, because we know that the space is enough */
3199 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3200 }
3201 else if (repsize!=0) {
3202 /* more than one character */
3203 int requiredsize = *outsize + repsize - 1;
3204 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3205 return -1;
3206 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3207 *outp += repsize;
3208 }
3209 }
3210 else
3211 return -1;
3212 return 0;
3213}
3214
3215PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 int size,
3217 PyObject *mapping,
3218 const char *errors)
3219{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003220 /* output object */
3221 PyObject *res = NULL;
3222 /* pointers to the beginning and end+1 of input */
3223 const Py_UNICODE *startp = p;
3224 const Py_UNICODE *endp = p + size;
3225 /* pointer into the output */
3226 Py_UNICODE *str;
3227 /* current output position */
3228 int respos = 0;
3229 int ressize;
3230 char *reason = "character maps to <undefined>";
3231 PyObject *errorHandler = NULL;
3232 PyObject *exc = NULL;
3233 /* the following variable is used for caching string comparisons
3234 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3235 * 3=ignore, 4=xmlcharrefreplace */
3236 int known_errorHandler = -1;
3237
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 if (mapping == NULL) {
3239 PyErr_BadArgument();
3240 return NULL;
3241 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003242
3243 /* allocate enough for a simple 1:1 translation without
3244 replacements, if we need more, we'll resize */
3245 res = PyUnicode_FromUnicode(NULL, size);
3246 if (res == NULL)
3247 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003249 return res;
3250 str = PyUnicode_AS_UNICODE(res);
3251 ressize = size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003253 while (p<endp) {
3254 /* try to encode it */
3255 PyObject *x = NULL;
3256 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3257 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 goto onError;
3259 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003260 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261 if (x!=Py_None) /* it worked => adjust input pointer */
3262 ++p;
3263 else { /* untranslatable character */
3264 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3265 int repsize;
3266 int newpos;
3267 Py_UNICODE *uni2;
3268 /* startpos for collecting untranslatable chars */
3269 const Py_UNICODE *collstart = p;
3270 const Py_UNICODE *collend = p+1;
3271 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003273 /* find all untranslatable characters */
3274 while (collend < endp) {
3275 if (charmaptranslate_lookup(*collend, mapping, &x))
3276 goto onError;
3277 Py_XDECREF(x);
3278 if (x!=Py_None)
3279 break;
3280 ++collend;
3281 }
3282 /* cache callback name lookup
3283 * (if not done yet, i.e. it's the first error) */
3284 if (known_errorHandler==-1) {
3285 if ((errors==NULL) || (!strcmp(errors, "strict")))
3286 known_errorHandler = 1;
3287 else if (!strcmp(errors, "replace"))
3288 known_errorHandler = 2;
3289 else if (!strcmp(errors, "ignore"))
3290 known_errorHandler = 3;
3291 else if (!strcmp(errors, "xmlcharrefreplace"))
3292 known_errorHandler = 4;
3293 else
3294 known_errorHandler = 0;
3295 }
3296 switch (known_errorHandler) {
3297 case 1: /* strict */
3298 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3299 goto onError;
3300 case 2: /* replace */
3301 /* No need to check for space, this is a 1:1 replacement */
3302 for (coll = collstart; coll<collend; ++coll)
3303 *str++ = '?';
3304 /* fall through */
3305 case 3: /* ignore */
3306 p = collend;
3307 break;
3308 case 4: /* xmlcharrefreplace */
3309 /* generate replacement (temporarily (mis)uses p) */
3310 for (p = collstart; p < collend; ++p) {
3311 char buffer[2+29+1+1];
3312 char *cp;
3313 sprintf(buffer, "&#%d;", (int)*p);
3314 if (charmaptranslate_makespace(&res, &str, &ressize,
3315 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3316 goto onError;
3317 for (cp = buffer; *cp; ++cp)
3318 *str++ = *cp;
3319 }
3320 p = collend;
3321 break;
3322 default:
3323 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3324 reason, startp, size, &exc,
3325 collstart-startp, collend-startp, &newpos);
3326 if (repunicode == NULL)
3327 goto onError;
3328 /* generate replacement */
3329 repsize = PyUnicode_GET_SIZE(repunicode);
3330 if (charmaptranslate_makespace(&res, &str, &ressize,
3331 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3332 Py_DECREF(repunicode);
3333 goto onError;
3334 }
3335 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3336 *str++ = *uni2;
3337 p = startp + newpos;
3338 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 }
3340 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 /* Resize if we allocated to much */
3343 respos = str-PyUnicode_AS_UNICODE(res);
3344 if (respos<ressize) {
3345 if (_PyUnicode_Resize(&res, respos))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003346 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003347 }
3348 Py_XDECREF(exc);
3349 Py_XDECREF(errorHandler);
3350 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003352 onError:
3353 Py_XDECREF(res);
3354 Py_XDECREF(exc);
3355 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 return NULL;
3357}
3358
3359PyObject *PyUnicode_Translate(PyObject *str,
3360 PyObject *mapping,
3361 const char *errors)
3362{
3363 PyObject *result;
3364
3365 str = PyUnicode_FromObject(str);
3366 if (str == NULL)
3367 goto onError;
3368 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3369 PyUnicode_GET_SIZE(str),
3370 mapping,
3371 errors);
3372 Py_DECREF(str);
3373 return result;
3374
3375 onError:
3376 Py_XDECREF(str);
3377 return NULL;
3378}
3379
Guido van Rossum9e896b32000-04-05 20:11:21 +00003380/* --- Decimal Encoder ---------------------------------------------------- */
3381
3382int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3383 int length,
3384 char *output,
3385 const char *errors)
3386{
3387 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003388 PyObject *errorHandler = NULL;
3389 PyObject *exc = NULL;
3390 const char *encoding = "decimal";
3391 const char *reason = "invalid decimal Unicode string";
3392 /* the following variable is used for caching string comparisons
3393 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3394 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003395
3396 if (output == NULL) {
3397 PyErr_BadArgument();
3398 return -1;
3399 }
3400
3401 p = s;
3402 end = s + length;
3403 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003404 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003405 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003406 PyObject *repunicode;
3407 int repsize;
3408 int newpos;
3409 Py_UNICODE *uni2;
3410 Py_UNICODE *collstart;
3411 Py_UNICODE *collend;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003412
3413 if (Py_UNICODE_ISSPACE(ch)) {
3414 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003416 continue;
3417 }
3418 decimal = Py_UNICODE_TODECIMAL(ch);
3419 if (decimal >= 0) {
3420 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003422 continue;
3423 }
Guido van Rossumba477042000-04-06 18:18:10 +00003424 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003425 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003427 continue;
3428 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 /* All other characters are considered unencodable */
3430 collstart = p;
3431 collend = p+1;
3432 while (collend < end) {
3433 if ((0 < *collend && *collend < 256) ||
3434 !Py_UNICODE_ISSPACE(*collend) ||
3435 Py_UNICODE_TODECIMAL(*collend))
3436 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003437 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003438 /* cache callback name lookup
3439 * (if not done yet, i.e. it's the first error) */
3440 if (known_errorHandler==-1) {
3441 if ((errors==NULL) || (!strcmp(errors, "strict")))
3442 known_errorHandler = 1;
3443 else if (!strcmp(errors, "replace"))
3444 known_errorHandler = 2;
3445 else if (!strcmp(errors, "ignore"))
3446 known_errorHandler = 3;
3447 else if (!strcmp(errors, "xmlcharrefreplace"))
3448 known_errorHandler = 4;
3449 else
3450 known_errorHandler = 0;
3451 }
3452 switch (known_errorHandler) {
3453 case 1: /* strict */
3454 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3455 goto onError;
3456 case 2: /* replace */
3457 for (p = collstart; p < collend; ++p)
3458 *output++ = '?';
3459 /* fall through */
3460 case 3: /* ignore */
3461 p = collend;
3462 break;
3463 case 4: /* xmlcharrefreplace */
3464 /* generate replacement (temporarily (mis)uses p) */
3465 for (p = collstart; p < collend; ++p)
3466 output += sprintf(output, "&#%d;", (int)*p);
3467 p = collend;
3468 break;
3469 default:
3470 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3471 encoding, reason, s, length, &exc,
3472 collstart-s, collend-s, &newpos);
3473 if (repunicode == NULL)
3474 goto onError;
3475 /* generate replacement */
3476 repsize = PyUnicode_GET_SIZE(repunicode);
3477 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3478 Py_UNICODE ch = *uni2;
3479 if (Py_UNICODE_ISSPACE(ch))
3480 *output++ = ' ';
3481 else {
3482 decimal = Py_UNICODE_TODECIMAL(ch);
3483 if (decimal >= 0)
3484 *output++ = '0' + decimal;
3485 else if (0 < ch && ch < 256)
3486 *output++ = (char)ch;
3487 else {
3488 Py_DECREF(repunicode);
3489 raise_encode_exception(&exc, encoding,
3490 s, length, collstart-s, collend-s, reason);
3491 goto onError;
3492 }
3493 }
3494 }
3495 p = s + newpos;
3496 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003497 }
3498 }
3499 /* 0-terminate the output string */
3500 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 Py_XDECREF(exc);
3502 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003503 return 0;
3504
3505 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 Py_XDECREF(exc);
3507 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003508 return -1;
3509}
3510
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511/* --- Helpers ------------------------------------------------------------ */
3512
3513static
3514int count(PyUnicodeObject *self,
3515 int start,
3516 int end,
3517 PyUnicodeObject *substring)
3518{
3519 int count = 0;
3520
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003521 if (start < 0)
3522 start += self->length;
3523 if (start < 0)
3524 start = 0;
3525 if (end > self->length)
3526 end = self->length;
3527 if (end < 0)
3528 end += self->length;
3529 if (end < 0)
3530 end = 0;
3531
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003532 if (substring->length == 0)
3533 return (end - start + 1);
3534
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535 end -= substring->length;
3536
3537 while (start <= end)
3538 if (Py_UNICODE_MATCH(self, start, substring)) {
3539 count++;
3540 start += substring->length;
3541 } else
3542 start++;
3543
3544 return count;
3545}
3546
3547int PyUnicode_Count(PyObject *str,
3548 PyObject *substr,
3549 int start,
3550 int end)
3551{
3552 int result;
3553
3554 str = PyUnicode_FromObject(str);
3555 if (str == NULL)
3556 return -1;
3557 substr = PyUnicode_FromObject(substr);
3558 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003559 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 return -1;
3561 }
3562
3563 result = count((PyUnicodeObject *)str,
3564 start, end,
3565 (PyUnicodeObject *)substr);
3566
3567 Py_DECREF(str);
3568 Py_DECREF(substr);
3569 return result;
3570}
3571
3572static
3573int findstring(PyUnicodeObject *self,
3574 PyUnicodeObject *substring,
3575 int start,
3576 int end,
3577 int direction)
3578{
3579 if (start < 0)
3580 start += self->length;
3581 if (start < 0)
3582 start = 0;
3583
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584 if (end > self->length)
3585 end = self->length;
3586 if (end < 0)
3587 end += self->length;
3588 if (end < 0)
3589 end = 0;
3590
Guido van Rossum76afbd92002-08-20 17:29:29 +00003591 if (substring->length == 0)
3592 return (direction > 0) ? start : end;
3593
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 end -= substring->length;
3595
3596 if (direction < 0) {
3597 for (; end >= start; end--)
3598 if (Py_UNICODE_MATCH(self, end, substring))
3599 return end;
3600 } else {
3601 for (; start <= end; start++)
3602 if (Py_UNICODE_MATCH(self, start, substring))
3603 return start;
3604 }
3605
3606 return -1;
3607}
3608
3609int PyUnicode_Find(PyObject *str,
3610 PyObject *substr,
3611 int start,
3612 int end,
3613 int direction)
3614{
3615 int result;
3616
3617 str = PyUnicode_FromObject(str);
3618 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003619 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 substr = PyUnicode_FromObject(substr);
3621 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003622 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003623 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 }
3625
3626 result = findstring((PyUnicodeObject *)str,
3627 (PyUnicodeObject *)substr,
3628 start, end, direction);
3629 Py_DECREF(str);
3630 Py_DECREF(substr);
3631 return result;
3632}
3633
3634static
3635int tailmatch(PyUnicodeObject *self,
3636 PyUnicodeObject *substring,
3637 int start,
3638 int end,
3639 int direction)
3640{
3641 if (start < 0)
3642 start += self->length;
3643 if (start < 0)
3644 start = 0;
3645
3646 if (substring->length == 0)
3647 return 1;
3648
3649 if (end > self->length)
3650 end = self->length;
3651 if (end < 0)
3652 end += self->length;
3653 if (end < 0)
3654 end = 0;
3655
3656 end -= substring->length;
3657 if (end < start)
3658 return 0;
3659
3660 if (direction > 0) {
3661 if (Py_UNICODE_MATCH(self, end, substring))
3662 return 1;
3663 } else {
3664 if (Py_UNICODE_MATCH(self, start, substring))
3665 return 1;
3666 }
3667
3668 return 0;
3669}
3670
3671int PyUnicode_Tailmatch(PyObject *str,
3672 PyObject *substr,
3673 int start,
3674 int end,
3675 int direction)
3676{
3677 int result;
3678
3679 str = PyUnicode_FromObject(str);
3680 if (str == NULL)
3681 return -1;
3682 substr = PyUnicode_FromObject(substr);
3683 if (substr == NULL) {
3684 Py_DECREF(substr);
3685 return -1;
3686 }
3687
3688 result = tailmatch((PyUnicodeObject *)str,
3689 (PyUnicodeObject *)substr,
3690 start, end, direction);
3691 Py_DECREF(str);
3692 Py_DECREF(substr);
3693 return result;
3694}
3695
3696static
3697const Py_UNICODE *findchar(const Py_UNICODE *s,
3698 int size,
3699 Py_UNICODE ch)
3700{
3701 /* like wcschr, but doesn't stop at NULL characters */
3702
3703 while (size-- > 0) {
3704 if (*s == ch)
3705 return s;
3706 s++;
3707 }
3708
3709 return NULL;
3710}
3711
3712/* Apply fixfct filter to the Unicode object self and return a
3713 reference to the modified object */
3714
3715static
3716PyObject *fixup(PyUnicodeObject *self,
3717 int (*fixfct)(PyUnicodeObject *s))
3718{
3719
3720 PyUnicodeObject *u;
3721
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003722 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723 if (u == NULL)
3724 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003725
3726 Py_UNICODE_COPY(u->str, self->str, self->length);
3727
Tim Peters7a29bd52001-09-12 03:03:31 +00003728 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 /* fixfct should return TRUE if it modified the buffer. If
3730 FALSE, return a reference to the original buffer instead
3731 (to save space, not time) */
3732 Py_INCREF(self);
3733 Py_DECREF(u);
3734 return (PyObject*) self;
3735 }
3736 return (PyObject*) u;
3737}
3738
3739static
3740int fixupper(PyUnicodeObject *self)
3741{
3742 int len = self->length;
3743 Py_UNICODE *s = self->str;
3744 int status = 0;
3745
3746 while (len-- > 0) {
3747 register Py_UNICODE ch;
3748
3749 ch = Py_UNICODE_TOUPPER(*s);
3750 if (ch != *s) {
3751 status = 1;
3752 *s = ch;
3753 }
3754 s++;
3755 }
3756
3757 return status;
3758}
3759
3760static
3761int fixlower(PyUnicodeObject *self)
3762{
3763 int len = self->length;
3764 Py_UNICODE *s = self->str;
3765 int status = 0;
3766
3767 while (len-- > 0) {
3768 register Py_UNICODE ch;
3769
3770 ch = Py_UNICODE_TOLOWER(*s);
3771 if (ch != *s) {
3772 status = 1;
3773 *s = ch;
3774 }
3775 s++;
3776 }
3777
3778 return status;
3779}
3780
3781static
3782int fixswapcase(PyUnicodeObject *self)
3783{
3784 int len = self->length;
3785 Py_UNICODE *s = self->str;
3786 int status = 0;
3787
3788 while (len-- > 0) {
3789 if (Py_UNICODE_ISUPPER(*s)) {
3790 *s = Py_UNICODE_TOLOWER(*s);
3791 status = 1;
3792 } else if (Py_UNICODE_ISLOWER(*s)) {
3793 *s = Py_UNICODE_TOUPPER(*s);
3794 status = 1;
3795 }
3796 s++;
3797 }
3798
3799 return status;
3800}
3801
3802static
3803int fixcapitalize(PyUnicodeObject *self)
3804{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003805 int len = self->length;
3806 Py_UNICODE *s = self->str;
3807 int status = 0;
3808
3809 if (len == 0)
3810 return 0;
3811 if (Py_UNICODE_ISLOWER(*s)) {
3812 *s = Py_UNICODE_TOUPPER(*s);
3813 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003815 s++;
3816 while (--len > 0) {
3817 if (Py_UNICODE_ISUPPER(*s)) {
3818 *s = Py_UNICODE_TOLOWER(*s);
3819 status = 1;
3820 }
3821 s++;
3822 }
3823 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824}
3825
3826static
3827int fixtitle(PyUnicodeObject *self)
3828{
3829 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3830 register Py_UNICODE *e;
3831 int previous_is_cased;
3832
3833 /* Shortcut for single character strings */
3834 if (PyUnicode_GET_SIZE(self) == 1) {
3835 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3836 if (*p != ch) {
3837 *p = ch;
3838 return 1;
3839 }
3840 else
3841 return 0;
3842 }
3843
3844 e = p + PyUnicode_GET_SIZE(self);
3845 previous_is_cased = 0;
3846 for (; p < e; p++) {
3847 register const Py_UNICODE ch = *p;
3848
3849 if (previous_is_cased)
3850 *p = Py_UNICODE_TOLOWER(ch);
3851 else
3852 *p = Py_UNICODE_TOTITLE(ch);
3853
3854 if (Py_UNICODE_ISLOWER(ch) ||
3855 Py_UNICODE_ISUPPER(ch) ||
3856 Py_UNICODE_ISTITLE(ch))
3857 previous_is_cased = 1;
3858 else
3859 previous_is_cased = 0;
3860 }
3861 return 1;
3862}
3863
3864PyObject *PyUnicode_Join(PyObject *separator,
3865 PyObject *seq)
3866{
3867 Py_UNICODE *sep;
3868 int seplen;
3869 PyUnicodeObject *res = NULL;
3870 int reslen = 0;
3871 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872 int sz = 100;
3873 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003874 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875
Tim Peters2cfe3682001-05-05 05:36:48 +00003876 it = PyObject_GetIter(seq);
3877 if (it == NULL)
3878 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879
3880 if (separator == NULL) {
3881 Py_UNICODE blank = ' ';
3882 sep = &blank;
3883 seplen = 1;
3884 }
3885 else {
3886 separator = PyUnicode_FromObject(separator);
3887 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889 sep = PyUnicode_AS_UNICODE(separator);
3890 seplen = PyUnicode_GET_SIZE(separator);
3891 }
3892
3893 res = _PyUnicode_New(sz);
3894 if (res == NULL)
3895 goto onError;
3896 p = PyUnicode_AS_UNICODE(res);
3897 reslen = 0;
3898
Tim Peters2cfe3682001-05-05 05:36:48 +00003899 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003901 PyObject *item = PyIter_Next(it);
3902 if (item == NULL) {
3903 if (PyErr_Occurred())
3904 goto onError;
3905 break;
3906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907 if (!PyUnicode_Check(item)) {
3908 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003909 if (!PyString_Check(item)) {
3910 PyErr_Format(PyExc_TypeError,
3911 "sequence item %i: expected string or Unicode,"
3912 " %.80s found",
3913 i, item->ob_type->tp_name);
3914 Py_DECREF(item);
3915 goto onError;
3916 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 v = PyUnicode_FromObject(item);
3918 Py_DECREF(item);
3919 item = v;
3920 if (item == NULL)
3921 goto onError;
3922 }
3923 itemlen = PyUnicode_GET_SIZE(item);
3924 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003925 if (_PyUnicode_Resize(&res, sz*2)) {
3926 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003928 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003929 sz *= 2;
3930 p = PyUnicode_AS_UNICODE(res) + reslen;
3931 }
3932 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003933 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934 p += seplen;
3935 reslen += seplen;
3936 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003937 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938 p += itemlen;
3939 reslen += itemlen;
3940 Py_DECREF(item);
3941 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003942 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 goto onError;
3944
3945 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003946 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 return (PyObject *)res;
3948
3949 onError:
3950 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003951 Py_XDECREF(res);
3952 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 return NULL;
3954}
3955
3956static
3957PyUnicodeObject *pad(PyUnicodeObject *self,
3958 int left,
3959 int right,
3960 Py_UNICODE fill)
3961{
3962 PyUnicodeObject *u;
3963
3964 if (left < 0)
3965 left = 0;
3966 if (right < 0)
3967 right = 0;
3968
Tim Peters7a29bd52001-09-12 03:03:31 +00003969 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 Py_INCREF(self);
3971 return self;
3972 }
3973
3974 u = _PyUnicode_New(left + self->length + right);
3975 if (u) {
3976 if (left)
3977 Py_UNICODE_FILL(u->str, fill, left);
3978 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3979 if (right)
3980 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3981 }
3982
3983 return u;
3984}
3985
/* Append the slice data[left:right] to list as a new Unicode object.

   The expanding function must provide a PyObject *str scratch variable,
   a PyObject *list, and an onError label; the macro jumps to onError when
   creating the slice or appending it fails.  Invoke it as a statement,
   followed by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3996
3997static
3998PyObject *split_whitespace(PyUnicodeObject *self,
3999 PyObject *list,
4000 int maxcount)
4001{
4002 register int i;
4003 register int j;
4004 int len = self->length;
4005 PyObject *str;
4006
4007 for (i = j = 0; i < len; ) {
4008 /* find a token */
4009 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4010 i++;
4011 j = i;
4012 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4013 i++;
4014 if (j < i) {
4015 if (maxcount-- <= 0)
4016 break;
4017 SPLIT_APPEND(self->str, j, i);
4018 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4019 i++;
4020 j = i;
4021 }
4022 }
4023 if (j < len) {
4024 SPLIT_APPEND(self->str, j, len);
4025 }
4026 return list;
4027
4028 onError:
4029 Py_DECREF(list);
4030 return NULL;
4031}
4032
4033PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004034 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035{
4036 register int i;
4037 register int j;
4038 int len;
4039 PyObject *list;
4040 PyObject *str;
4041 Py_UNICODE *data;
4042
4043 string = PyUnicode_FromObject(string);
4044 if (string == NULL)
4045 return NULL;
4046 data = PyUnicode_AS_UNICODE(string);
4047 len = PyUnicode_GET_SIZE(string);
4048
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049 list = PyList_New(0);
4050 if (!list)
4051 goto onError;
4052
4053 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004054 int eol;
4055
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056 /* Find a line and append it */
4057 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4058 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059
4060 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004061 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 if (i < len) {
4063 if (data[i] == '\r' && i + 1 < len &&
4064 data[i+1] == '\n')
4065 i += 2;
4066 else
4067 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004068 if (keepends)
4069 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 }
Guido van Rossum86662912000-04-11 15:38:46 +00004071 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 j = i;
4073 }
4074 if (j < len) {
4075 SPLIT_APPEND(data, j, len);
4076 }
4077
4078 Py_DECREF(string);
4079 return list;
4080
4081 onError:
4082 Py_DECREF(list);
4083 Py_DECREF(string);
4084 return NULL;
4085}
4086
4087static
4088PyObject *split_char(PyUnicodeObject *self,
4089 PyObject *list,
4090 Py_UNICODE ch,
4091 int maxcount)
4092{
4093 register int i;
4094 register int j;
4095 int len = self->length;
4096 PyObject *str;
4097
4098 for (i = j = 0; i < len; ) {
4099 if (self->str[i] == ch) {
4100 if (maxcount-- <= 0)
4101 break;
4102 SPLIT_APPEND(self->str, j, i);
4103 i = j = i + 1;
4104 } else
4105 i++;
4106 }
4107 if (j <= len) {
4108 SPLIT_APPEND(self->str, j, len);
4109 }
4110 return list;
4111
4112 onError:
4113 Py_DECREF(list);
4114 return NULL;
4115}
4116
4117static
4118PyObject *split_substring(PyUnicodeObject *self,
4119 PyObject *list,
4120 PyUnicodeObject *substring,
4121 int maxcount)
4122{
4123 register int i;
4124 register int j;
4125 int len = self->length;
4126 int sublen = substring->length;
4127 PyObject *str;
4128
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004129 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 if (Py_UNICODE_MATCH(self, i, substring)) {
4131 if (maxcount-- <= 0)
4132 break;
4133 SPLIT_APPEND(self->str, j, i);
4134 i = j = i + sublen;
4135 } else
4136 i++;
4137 }
4138 if (j <= len) {
4139 SPLIT_APPEND(self->str, j, len);
4140 }
4141 return list;
4142
4143 onError:
4144 Py_DECREF(list);
4145 return NULL;
4146}
4147
4148#undef SPLIT_APPEND
4149
4150static
4151PyObject *split(PyUnicodeObject *self,
4152 PyUnicodeObject *substring,
4153 int maxcount)
4154{
4155 PyObject *list;
4156
4157 if (maxcount < 0)
4158 maxcount = INT_MAX;
4159
4160 list = PyList_New(0);
4161 if (!list)
4162 return NULL;
4163
4164 if (substring == NULL)
4165 return split_whitespace(self,list,maxcount);
4166
4167 else if (substring->length == 1)
4168 return split_char(self,list,substring->str[0],maxcount);
4169
4170 else if (substring->length == 0) {
4171 Py_DECREF(list);
4172 PyErr_SetString(PyExc_ValueError, "empty separator");
4173 return NULL;
4174 }
4175 else
4176 return split_substring(self,list,substring,maxcount);
4177}
4178
4179static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180PyObject *replace(PyUnicodeObject *self,
4181 PyUnicodeObject *str1,
4182 PyUnicodeObject *str2,
4183 int maxcount)
4184{
4185 PyUnicodeObject *u;
4186
4187 if (maxcount < 0)
4188 maxcount = INT_MAX;
4189
4190 if (str1->length == 1 && str2->length == 1) {
4191 int i;
4192
4193 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004194 if (!findchar(self->str, self->length, str1->str[0]) &&
4195 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196 /* nothing to replace, return original string */
4197 Py_INCREF(self);
4198 u = self;
4199 } else {
4200 Py_UNICODE u1 = str1->str[0];
4201 Py_UNICODE u2 = str2->str[0];
4202
4203 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004204 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 self->length
4206 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004207 if (u != NULL) {
4208 Py_UNICODE_COPY(u->str, self->str,
4209 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210 for (i = 0; i < u->length; i++)
4211 if (u->str[i] == u1) {
4212 if (--maxcount < 0)
4213 break;
4214 u->str[i] = u2;
4215 }
4216 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004217 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218
4219 } else {
4220 int n, i;
4221 Py_UNICODE *p;
4222
4223 /* replace strings */
4224 n = count(self, 0, self->length, str1);
4225 if (n > maxcount)
4226 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004227 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004229 if (PyUnicode_CheckExact(self)) {
4230 Py_INCREF(self);
4231 u = self;
4232 }
4233 else {
4234 u = (PyUnicodeObject *)
4235 PyUnicode_FromUnicode(self->str, self->length);
4236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237 } else {
4238 u = _PyUnicode_New(
4239 self->length + n * (str2->length - str1->length));
4240 if (u) {
4241 i = 0;
4242 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004243 if (str1->length > 0) {
4244 while (i <= self->length - str1->length)
4245 if (Py_UNICODE_MATCH(self, i, str1)) {
4246 /* replace string segment */
4247 Py_UNICODE_COPY(p, str2->str, str2->length);
4248 p += str2->length;
4249 i += str1->length;
4250 if (--n <= 0) {
4251 /* copy remaining part */
4252 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4253 break;
4254 }
4255 } else
4256 *p++ = self->str[i++];
4257 } else {
4258 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 Py_UNICODE_COPY(p, str2->str, str2->length);
4260 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004261 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004264 }
4265 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267 }
4268 }
4269 }
4270
4271 return (PyObject *) u;
4272}
4273
4274/* --- Unicode Object Methods --------------------------------------------- */
4275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004276PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277"S.title() -> unicode\n\
4278\n\
4279Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004280characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281
4282static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004283unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 return fixup(self, fixtitle);
4286}
4287
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004288PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289"S.capitalize() -> unicode\n\
4290\n\
4291Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004292have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293
4294static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004295unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004296{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 return fixup(self, fixcapitalize);
4298}
4299
#if 0
/* Disabled: this method is currently compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
4337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004338PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339"S.center(width) -> unicode\n\
4340\n\
4341Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004342using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343
4344static PyObject *
4345unicode_center(PyUnicodeObject *self, PyObject *args)
4346{
4347 int marg, left;
4348 int width;
4349
4350 if (!PyArg_ParseTuple(args, "i:center", &width))
4351 return NULL;
4352
Tim Peters7a29bd52001-09-12 03:03:31 +00004353 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 Py_INCREF(self);
4355 return (PyObject*) self;
4356 }
4357
4358 marg = width - self->length;
4359 left = marg / 2 + (marg & width & 1);
4360
4361 return (PyObject*) pad(self, left, marg - left, ' ');
4362}
4363
Marc-André Lemburge5034372000-08-08 08:04:29 +00004364#if 0
4365
4366/* This code should go into some future Unicode collation support
4367 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004368 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004369
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004370/* speedy UTF-16 code point order comparison */
4371/* gleaned from: */
4372/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4373
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004374static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004375{
4376 0, 0, 0, 0, 0, 0, 0, 0,
4377 0, 0, 0, 0, 0, 0, 0, 0,
4378 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004379 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004380};
4381
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382static int
4383unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4384{
4385 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004386
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387 Py_UNICODE *s1 = str1->str;
4388 Py_UNICODE *s2 = str2->str;
4389
4390 len1 = str1->length;
4391 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004392
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004394 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004395
4396 c1 = *s1++;
4397 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004398
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004399 if (c1 > (1<<11) * 26)
4400 c1 += utf16Fixup[c1>>11];
4401 if (c2 > (1<<11) * 26)
4402 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004403 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004404
4405 if (c1 != c2)
4406 return (c1 < c2) ? -1 : 1;
4407
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004408 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409 }
4410
4411 return (len1 < len2) ? -1 : (len1 != len2);
4412}
4413
Marc-André Lemburge5034372000-08-08 08:04:29 +00004414#else
4415
4416static int
4417unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4418{
4419 register int len1, len2;
4420
4421 Py_UNICODE *s1 = str1->str;
4422 Py_UNICODE *s2 = str2->str;
4423
4424 len1 = str1->length;
4425 len2 = str2->length;
4426
4427 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00004428 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004429
Fredrik Lundh45714e92001-06-26 16:39:36 +00004430 c1 = *s1++;
4431 c2 = *s2++;
4432
4433 if (c1 != c2)
4434 return (c1 < c2) ? -1 : 1;
4435
Marc-André Lemburge5034372000-08-08 08:04:29 +00004436 len1--; len2--;
4437 }
4438
4439 return (len1 < len2) ? -1 : (len1 != len2);
4440}
4441
4442#endif
4443
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444int PyUnicode_Compare(PyObject *left,
4445 PyObject *right)
4446{
4447 PyUnicodeObject *u = NULL, *v = NULL;
4448 int result;
4449
4450 /* Coerce the two arguments */
4451 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4452 if (u == NULL)
4453 goto onError;
4454 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4455 if (v == NULL)
4456 goto onError;
4457
Thomas Wouters7e474022000-07-16 12:04:32 +00004458 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 if (v == u) {
4460 Py_DECREF(u);
4461 Py_DECREF(v);
4462 return 0;
4463 }
4464
4465 result = unicode_compare(u, v);
4466
4467 Py_DECREF(u);
4468 Py_DECREF(v);
4469 return result;
4470
4471onError:
4472 Py_XDECREF(u);
4473 Py_XDECREF(v);
4474 return -1;
4475}
4476
Guido van Rossum403d68b2000-03-13 15:55:09 +00004477int PyUnicode_Contains(PyObject *container,
4478 PyObject *element)
4479{
4480 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004481 int result, size;
4482 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004483
4484 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004485 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004486 if (v == NULL) {
4487 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004488 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004489 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004490 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004491 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004492 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004493 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004494
Barry Warsaw817918c2002-08-06 16:58:21 +00004495 size = PyUnicode_GET_SIZE(v);
4496 rhs = PyUnicode_AS_UNICODE(v);
4497 lhs = PyUnicode_AS_UNICODE(u);
4498
Guido van Rossum403d68b2000-03-13 15:55:09 +00004499 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004500 if (size == 1) {
4501 end = lhs + PyUnicode_GET_SIZE(u);
4502 while (lhs < end) {
4503 if (*lhs++ == *rhs) {
4504 result = 1;
4505 break;
4506 }
4507 }
4508 }
4509 else {
4510 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4511 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004512 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004513 result = 1;
4514 break;
4515 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004516 }
4517 }
4518
4519 Py_DECREF(u);
4520 Py_DECREF(v);
4521 return result;
4522
4523onError:
4524 Py_XDECREF(u);
4525 Py_XDECREF(v);
4526 return -1;
4527}
4528
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529/* Concat to string or Unicode object giving a new Unicode object. */
4530
4531PyObject *PyUnicode_Concat(PyObject *left,
4532 PyObject *right)
4533{
4534 PyUnicodeObject *u = NULL, *v = NULL, *w;
4535
4536 /* Coerce the two arguments */
4537 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4538 if (u == NULL)
4539 goto onError;
4540 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4541 if (v == NULL)
4542 goto onError;
4543
4544 /* Shortcuts */
4545 if (v == unicode_empty) {
4546 Py_DECREF(v);
4547 return (PyObject *)u;
4548 }
4549 if (u == unicode_empty) {
4550 Py_DECREF(u);
4551 return (PyObject *)v;
4552 }
4553
4554 /* Concat the two Unicode strings */
4555 w = _PyUnicode_New(u->length + v->length);
4556 if (w == NULL)
4557 goto onError;
4558 Py_UNICODE_COPY(w->str, u->str, u->length);
4559 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4560
4561 Py_DECREF(u);
4562 Py_DECREF(v);
4563 return (PyObject *)w;
4564
4565onError:
4566 Py_XDECREF(u);
4567 Py_XDECREF(v);
4568 return NULL;
4569}
4570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004571PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572"S.count(sub[, start[, end]]) -> int\n\
4573\n\
4574Return the number of occurrences of substring sub in Unicode string\n\
4575S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004576interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577
4578static PyObject *
4579unicode_count(PyUnicodeObject *self, PyObject *args)
4580{
4581 PyUnicodeObject *substring;
4582 int start = 0;
4583 int end = INT_MAX;
4584 PyObject *result;
4585
Guido van Rossumb8872e62000-05-09 14:14:27 +00004586 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4587 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588 return NULL;
4589
4590 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4591 (PyObject *)substring);
4592 if (substring == NULL)
4593 return NULL;
4594
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595 if (start < 0)
4596 start += self->length;
4597 if (start < 0)
4598 start = 0;
4599 if (end > self->length)
4600 end = self->length;
4601 if (end < 0)
4602 end += self->length;
4603 if (end < 0)
4604 end = 0;
4605
4606 result = PyInt_FromLong((long) count(self, start, end, substring));
4607
4608 Py_DECREF(substring);
4609 return result;
4610}
4611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004612PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613"S.encode([encoding[,errors]]) -> string\n\
4614\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004615Return an encoded string version of S. Default encoding is the current\n\
4616default string encoding. errors may be given to set a different error\n\
4617handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4619'xmlcharrefreplace' as well as any other name registered with\n\
4620codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004621
4622static PyObject *
4623unicode_encode(PyUnicodeObject *self, PyObject *args)
4624{
4625 char *encoding = NULL;
4626 char *errors = NULL;
4627 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4628 return NULL;
4629 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4630}
4631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004632PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633"S.expandtabs([tabsize]) -> unicode\n\
4634\n\
4635Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004636If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637
4638static PyObject*
4639unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4640{
4641 Py_UNICODE *e;
4642 Py_UNICODE *p;
4643 Py_UNICODE *q;
4644 int i, j;
4645 PyUnicodeObject *u;
4646 int tabsize = 8;
4647
4648 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4649 return NULL;
4650
Thomas Wouters7e474022000-07-16 12:04:32 +00004651 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652 i = j = 0;
4653 e = self->str + self->length;
4654 for (p = self->str; p < e; p++)
4655 if (*p == '\t') {
4656 if (tabsize > 0)
4657 j += tabsize - (j % tabsize);
4658 }
4659 else {
4660 j++;
4661 if (*p == '\n' || *p == '\r') {
4662 i += j;
4663 j = 0;
4664 }
4665 }
4666
4667 /* Second pass: create output string and fill it */
4668 u = _PyUnicode_New(i + j);
4669 if (!u)
4670 return NULL;
4671
4672 j = 0;
4673 q = u->str;
4674
4675 for (p = self->str; p < e; p++)
4676 if (*p == '\t') {
4677 if (tabsize > 0) {
4678 i = tabsize - (j % tabsize);
4679 j += i;
4680 while (i--)
4681 *q++ = ' ';
4682 }
4683 }
4684 else {
4685 j++;
4686 *q++ = *p;
4687 if (*p == '\n' || *p == '\r')
4688 j = 0;
4689 }
4690
4691 return (PyObject*) u;
4692}
4693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004694PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695"S.find(sub [,start [,end]]) -> int\n\
4696\n\
4697Return the lowest index in S where substring sub is found,\n\
4698such that sub is contained within s[start,end]. Optional\n\
4699arguments start and end are interpreted as in slice notation.\n\
4700\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004701Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702
4703static PyObject *
4704unicode_find(PyUnicodeObject *self, PyObject *args)
4705{
4706 PyUnicodeObject *substring;
4707 int start = 0;
4708 int end = INT_MAX;
4709 PyObject *result;
4710
Guido van Rossumb8872e62000-05-09 14:14:27 +00004711 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4712 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713 return NULL;
4714 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4715 (PyObject *)substring);
4716 if (substring == NULL)
4717 return NULL;
4718
4719 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4720
4721 Py_DECREF(substring);
4722 return result;
4723}
4724
4725static PyObject *
4726unicode_getitem(PyUnicodeObject *self, int index)
4727{
4728 if (index < 0 || index >= self->length) {
4729 PyErr_SetString(PyExc_IndexError, "string index out of range");
4730 return NULL;
4731 }
4732
4733 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4734}
4735
4736static long
4737unicode_hash(PyUnicodeObject *self)
4738{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004739 /* Since Unicode objects compare equal to their ASCII string
4740 counterparts, they should use the individual character values
4741 as basis for their hash value. This is needed to assure that
4742 strings and Unicode objects behave in the same way as
4743 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744
Fredrik Lundhdde61642000-07-10 18:27:47 +00004745 register int len;
4746 register Py_UNICODE *p;
4747 register long x;
4748
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749 if (self->hash != -1)
4750 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004751 len = PyUnicode_GET_SIZE(self);
4752 p = PyUnicode_AS_UNICODE(self);
4753 x = *p << 7;
4754 while (--len >= 0)
4755 x = (1000003*x) ^ *p++;
4756 x ^= PyUnicode_GET_SIZE(self);
4757 if (x == -1)
4758 x = -2;
4759 self->hash = x;
4760 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761}
4762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004763PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764"S.index(sub [,start [,end]]) -> int\n\
4765\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004766Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767
4768static PyObject *
4769unicode_index(PyUnicodeObject *self, PyObject *args)
4770{
4771 int result;
4772 PyUnicodeObject *substring;
4773 int start = 0;
4774 int end = INT_MAX;
4775
Guido van Rossumb8872e62000-05-09 14:14:27 +00004776 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4777 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 return NULL;
4779
4780 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4781 (PyObject *)substring);
4782 if (substring == NULL)
4783 return NULL;
4784
4785 result = findstring(self, substring, start, end, 1);
4786
4787 Py_DECREF(substring);
4788 if (result < 0) {
4789 PyErr_SetString(PyExc_ValueError, "substring not found");
4790 return NULL;
4791 }
4792 return PyInt_FromLong(result);
4793}
4794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004795PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004796"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004798Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004799at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800
4801static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004802unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803{
4804 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4805 register const Py_UNICODE *e;
4806 int cased;
4807
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808 /* Shortcut for single character strings */
4809 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004810 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004812 /* Special case for empty strings */
4813 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004814 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004815
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816 e = p + PyUnicode_GET_SIZE(self);
4817 cased = 0;
4818 for (; p < e; p++) {
4819 register const Py_UNICODE ch = *p;
4820
4821 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004822 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 else if (!cased && Py_UNICODE_ISLOWER(ch))
4824 cased = 1;
4825 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004826 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827}
4828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004829PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004830"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004832Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004833at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834
4835static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004836unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837{
4838 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4839 register const Py_UNICODE *e;
4840 int cased;
4841
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 /* Shortcut for single character strings */
4843 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004844 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004846 /* Special case for empty strings */
4847 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004848 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004849
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850 e = p + PyUnicode_GET_SIZE(self);
4851 cased = 0;
4852 for (; p < e; p++) {
4853 register const Py_UNICODE ch = *p;
4854
4855 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004856 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 else if (!cased && Py_UNICODE_ISUPPER(ch))
4858 cased = 1;
4859 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004860 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861}
4862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004863PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004864"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004866Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4867characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004868only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869
4870static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004871unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872{
4873 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4874 register const Py_UNICODE *e;
4875 int cased, previous_is_cased;
4876
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 /* Shortcut for single character strings */
4878 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004879 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4880 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004882 /* Special case for empty strings */
4883 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004884 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004885
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 e = p + PyUnicode_GET_SIZE(self);
4887 cased = 0;
4888 previous_is_cased = 0;
4889 for (; p < e; p++) {
4890 register const Py_UNICODE ch = *p;
4891
4892 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4893 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004894 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895 previous_is_cased = 1;
4896 cased = 1;
4897 }
4898 else if (Py_UNICODE_ISLOWER(ch)) {
4899 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004900 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 previous_is_cased = 1;
4902 cased = 1;
4903 }
4904 else
4905 previous_is_cased = 0;
4906 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004907 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908}
4909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004910PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004911"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004913Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004914False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915
4916static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004917unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918{
4919 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4920 register const Py_UNICODE *e;
4921
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 /* Shortcut for single character strings */
4923 if (PyUnicode_GET_SIZE(self) == 1 &&
4924 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004925 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004927 /* Special case for empty strings */
4928 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004929 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004930
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 e = p + PyUnicode_GET_SIZE(self);
4932 for (; p < e; p++) {
4933 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004934 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004936 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937}
4938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004939PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004940"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004941\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004942Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004943and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004944
4945static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004946unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004947{
4948 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4949 register const Py_UNICODE *e;
4950
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004951 /* Shortcut for single character strings */
4952 if (PyUnicode_GET_SIZE(self) == 1 &&
4953 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004954 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004955
4956 /* Special case for empty strings */
4957 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004958 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004959
4960 e = p + PyUnicode_GET_SIZE(self);
4961 for (; p < e; p++) {
4962 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004963 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004964 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004965 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004966}
4967
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004968PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004969"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004970\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004971Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004972and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004973
4974static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004975unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004976{
4977 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4978 register const Py_UNICODE *e;
4979
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004980 /* Shortcut for single character strings */
4981 if (PyUnicode_GET_SIZE(self) == 1 &&
4982 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004983 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004984
4985 /* Special case for empty strings */
4986 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004987 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004988
4989 e = p + PyUnicode_GET_SIZE(self);
4990 for (; p < e; p++) {
4991 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004992 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004993 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004994 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004995}
4996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004997PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004998"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005000Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005001False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002
5003static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005004unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005{
5006 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5007 register const Py_UNICODE *e;
5008
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009 /* Shortcut for single character strings */
5010 if (PyUnicode_GET_SIZE(self) == 1 &&
5011 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005012 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005014 /* Special case for empty strings */
5015 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005016 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005017
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018 e = p + PyUnicode_GET_SIZE(self);
5019 for (; p < e; p++) {
5020 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005021 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005023 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024}
5025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005026PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005027"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005029Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005030False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031
5032static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005033unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034{
5035 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5036 register const Py_UNICODE *e;
5037
Guido van Rossumd57fd912000-03-10 22:53:23 +00005038 /* Shortcut for single character strings */
5039 if (PyUnicode_GET_SIZE(self) == 1 &&
5040 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005041 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005043 /* Special case for empty strings */
5044 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005045 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005046
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 e = p + PyUnicode_GET_SIZE(self);
5048 for (; p < e; p++) {
5049 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005050 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005052 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053}
5054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005055PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005056"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005058Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005059False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060
5061static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005062unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063{
5064 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5065 register const Py_UNICODE *e;
5066
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067 /* Shortcut for single character strings */
5068 if (PyUnicode_GET_SIZE(self) == 1 &&
5069 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005070 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005072 /* Special case for empty strings */
5073 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005074 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005075
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 e = p + PyUnicode_GET_SIZE(self);
5077 for (; p < e; p++) {
5078 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005079 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005081 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082}
5083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005084PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085"S.join(sequence) -> unicode\n\
5086\n\
5087Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005088sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089
5090static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005091unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005093 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094}
5095
5096static int
5097unicode_length(PyUnicodeObject *self)
5098{
5099 return self->length;
5100}
5101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005102PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103"S.ljust(width) -> unicode\n\
5104\n\
5105Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005106done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107
5108static PyObject *
5109unicode_ljust(PyUnicodeObject *self, PyObject *args)
5110{
5111 int width;
5112 if (!PyArg_ParseTuple(args, "i:ljust", &width))
5113 return NULL;
5114
Tim Peters7a29bd52001-09-12 03:03:31 +00005115 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116 Py_INCREF(self);
5117 return (PyObject*) self;
5118 }
5119
5120 return (PyObject*) pad(self, 0, width - self->length, ' ');
5121}
5122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005123PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124"S.lower() -> unicode\n\
5125\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005126Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127
5128static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005129unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 return fixup(self, fixlower);
5132}
5133
/* Strip modes.  The numeric values double as indices into the
   stripformat[] table below, so the two must be kept in step. */
#define LEFTSTRIP  0    /* strip at the start only */
#define RIGHTSTRIP 1    /* strip at the end only */
#define BOTHSTRIP  2    /* strip at both ends */

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
5142
5143static const Py_UNICODE *
5144unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5145{
Tim Peters030a5ce2002-04-22 19:00:10 +00005146 size_t i;
5147 for (i = 0; i < n; ++i)
5148 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005149 return s+i;
5150 return NULL;
5151}
5152
5153/* externally visible for str.strip(unicode) */
5154PyObject *
5155_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5156{
5157 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5158 int len = PyUnicode_GET_SIZE(self);
5159 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5160 int seplen = PyUnicode_GET_SIZE(sepobj);
5161 int i, j;
5162
5163 i = 0;
5164 if (striptype != RIGHTSTRIP) {
5165 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5166 i++;
5167 }
5168 }
5169
5170 j = len;
5171 if (striptype != LEFTSTRIP) {
5172 do {
5173 j--;
5174 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5175 j++;
5176 }
5177
5178 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5179 Py_INCREF(self);
5180 return (PyObject*)self;
5181 }
5182 else
5183 return PyUnicode_FromUnicode(s+i, j-i);
5184}
5185
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186
5187static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005188do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005190 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5191 int len = PyUnicode_GET_SIZE(self), i, j;
5192
5193 i = 0;
5194 if (striptype != RIGHTSTRIP) {
5195 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5196 i++;
5197 }
5198 }
5199
5200 j = len;
5201 if (striptype != LEFTSTRIP) {
5202 do {
5203 j--;
5204 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5205 j++;
5206 }
5207
5208 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5209 Py_INCREF(self);
5210 return (PyObject*)self;
5211 }
5212 else
5213 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214}
5215
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005216
5217static PyObject *
5218do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5219{
5220 PyObject *sep = NULL;
5221
5222 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5223 return NULL;
5224
5225 if (sep != NULL && sep != Py_None) {
5226 if (PyUnicode_Check(sep))
5227 return _PyUnicode_XStrip(self, striptype, sep);
5228 else if (PyString_Check(sep)) {
5229 PyObject *res;
5230 sep = PyUnicode_FromObject(sep);
5231 if (sep==NULL)
5232 return NULL;
5233 res = _PyUnicode_XStrip(self, striptype, sep);
5234 Py_DECREF(sep);
5235 return res;
5236 }
5237 else {
5238 PyErr_Format(PyExc_TypeError,
5239 "%s arg must be None, unicode or str",
5240 STRIPNAME(striptype));
5241 return NULL;
5242 }
5243 }
5244
5245 return do_strip(self, striptype);
5246}
5247
5248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005249PyDoc_STRVAR(strip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005250"S.strip([sep]) -> unicode\n\
5251\n\
5252Return a copy of the string S with leading and trailing\n\
5253whitespace removed.\n\
5254If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005255If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005256
5257static PyObject *
5258unicode_strip(PyUnicodeObject *self, PyObject *args)
5259{
5260 if (PyTuple_GET_SIZE(args) == 0)
5261 return do_strip(self, BOTHSTRIP); /* Common case */
5262 else
5263 return do_argstrip(self, BOTHSTRIP, args);
5264}
5265
5266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005267PyDoc_STRVAR(lstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005268"S.lstrip([sep]) -> unicode\n\
5269\n\
5270Return a copy of the string S with leading whitespace removed.\n\
5271If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005272If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005273
5274static PyObject *
5275unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5276{
5277 if (PyTuple_GET_SIZE(args) == 0)
5278 return do_strip(self, LEFTSTRIP); /* Common case */
5279 else
5280 return do_argstrip(self, LEFTSTRIP, args);
5281}
5282
5283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005284PyDoc_STRVAR(rstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005285"S.rstrip([sep]) -> unicode\n\
5286\n\
5287Return a copy of the string S with trailing whitespace removed.\n\
5288If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005289If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005290
5291static PyObject *
5292unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5293{
5294 if (PyTuple_GET_SIZE(args) == 0)
5295 return do_strip(self, RIGHTSTRIP); /* Common case */
5296 else
5297 return do_argstrip(self, RIGHTSTRIP, args);
5298}
5299
5300
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301static PyObject*
5302unicode_repeat(PyUnicodeObject *str, int len)
5303{
5304 PyUnicodeObject *u;
5305 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005306 int nchars;
5307 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308
5309 if (len < 0)
5310 len = 0;
5311
Tim Peters7a29bd52001-09-12 03:03:31 +00005312 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 /* no repeat, return original string */
5314 Py_INCREF(str);
5315 return (PyObject*) str;
5316 }
Tim Peters8f422462000-09-09 06:13:41 +00005317
5318 /* ensure # of chars needed doesn't overflow int and # of bytes
5319 * needed doesn't overflow size_t
5320 */
5321 nchars = len * str->length;
5322 if (len && nchars / len != str->length) {
5323 PyErr_SetString(PyExc_OverflowError,
5324 "repeated string is too long");
5325 return NULL;
5326 }
5327 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5328 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5329 PyErr_SetString(PyExc_OverflowError,
5330 "repeated string is too long");
5331 return NULL;
5332 }
5333 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 if (!u)
5335 return NULL;
5336
5337 p = u->str;
5338
5339 while (len-- > 0) {
5340 Py_UNICODE_COPY(p, str->str, str->length);
5341 p += str->length;
5342 }
5343
5344 return (PyObject*) u;
5345}
5346
5347PyObject *PyUnicode_Replace(PyObject *obj,
5348 PyObject *subobj,
5349 PyObject *replobj,
5350 int maxcount)
5351{
5352 PyObject *self;
5353 PyObject *str1;
5354 PyObject *str2;
5355 PyObject *result;
5356
5357 self = PyUnicode_FromObject(obj);
5358 if (self == NULL)
5359 return NULL;
5360 str1 = PyUnicode_FromObject(subobj);
5361 if (str1 == NULL) {
5362 Py_DECREF(self);
5363 return NULL;
5364 }
5365 str2 = PyUnicode_FromObject(replobj);
5366 if (str2 == NULL) {
5367 Py_DECREF(self);
5368 Py_DECREF(str1);
5369 return NULL;
5370 }
5371 result = replace((PyUnicodeObject *)self,
5372 (PyUnicodeObject *)str1,
5373 (PyUnicodeObject *)str2,
5374 maxcount);
5375 Py_DECREF(self);
5376 Py_DECREF(str1);
5377 Py_DECREF(str2);
5378 return result;
5379}
5380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005381PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382"S.replace (old, new[, maxsplit]) -> unicode\n\
5383\n\
5384Return a copy of S with all occurrences of substring\n\
5385old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005386given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387
5388static PyObject*
5389unicode_replace(PyUnicodeObject *self, PyObject *args)
5390{
5391 PyUnicodeObject *str1;
5392 PyUnicodeObject *str2;
5393 int maxcount = -1;
5394 PyObject *result;
5395
5396 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5397 return NULL;
5398 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5399 if (str1 == NULL)
5400 return NULL;
5401 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005402 if (str2 == NULL) {
5403 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406
5407 result = replace(self, str1, str2, maxcount);
5408
5409 Py_DECREF(str1);
5410 Py_DECREF(str2);
5411 return result;
5412}
5413
5414static
5415PyObject *unicode_repr(PyObject *unicode)
5416{
5417 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5418 PyUnicode_GET_SIZE(unicode),
5419 1);
5420}
5421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005422PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423"S.rfind(sub [,start [,end]]) -> int\n\
5424\n\
5425Return the highest index in S where substring sub is found,\n\
5426such that sub is contained within s[start,end]. Optional\n\
5427arguments start and end are interpreted as in slice notation.\n\
5428\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005429Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430
5431static PyObject *
5432unicode_rfind(PyUnicodeObject *self, PyObject *args)
5433{
5434 PyUnicodeObject *substring;
5435 int start = 0;
5436 int end = INT_MAX;
5437 PyObject *result;
5438
Guido van Rossumb8872e62000-05-09 14:14:27 +00005439 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5440 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 return NULL;
5442 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5443 (PyObject *)substring);
5444 if (substring == NULL)
5445 return NULL;
5446
5447 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5448
5449 Py_DECREF(substring);
5450 return result;
5451}
5452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005453PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454"S.rindex(sub [,start [,end]]) -> int\n\
5455\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005456Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457
5458static PyObject *
5459unicode_rindex(PyUnicodeObject *self, PyObject *args)
5460{
5461 int result;
5462 PyUnicodeObject *substring;
5463 int start = 0;
5464 int end = INT_MAX;
5465
Guido van Rossumb8872e62000-05-09 14:14:27 +00005466 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5467 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 return NULL;
5469 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5470 (PyObject *)substring);
5471 if (substring == NULL)
5472 return NULL;
5473
5474 result = findstring(self, substring, start, end, -1);
5475
5476 Py_DECREF(substring);
5477 if (result < 0) {
5478 PyErr_SetString(PyExc_ValueError, "substring not found");
5479 return NULL;
5480 }
5481 return PyInt_FromLong(result);
5482}
5483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005484PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485"S.rjust(width) -> unicode\n\
5486\n\
5487Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005488done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
5490static PyObject *
5491unicode_rjust(PyUnicodeObject *self, PyObject *args)
5492{
5493 int width;
5494 if (!PyArg_ParseTuple(args, "i:rjust", &width))
5495 return NULL;
5496
Tim Peters7a29bd52001-09-12 03:03:31 +00005497 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 Py_INCREF(self);
5499 return (PyObject*) self;
5500 }
5501
5502 return (PyObject*) pad(self, width - self->length, 0, ' ');
5503}
5504
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505static PyObject*
5506unicode_slice(PyUnicodeObject *self, int start, int end)
5507{
5508 /* standard clamping */
5509 if (start < 0)
5510 start = 0;
5511 if (end < 0)
5512 end = 0;
5513 if (end > self->length)
5514 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005515 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 /* full slice, return original string */
5517 Py_INCREF(self);
5518 return (PyObject*) self;
5519 }
5520 if (start > end)
5521 start = end;
5522 /* copy slice */
5523 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5524 end - start);
5525}
5526
5527PyObject *PyUnicode_Split(PyObject *s,
5528 PyObject *sep,
5529 int maxsplit)
5530{
5531 PyObject *result;
5532
5533 s = PyUnicode_FromObject(s);
5534 if (s == NULL)
5535 return NULL;
5536 if (sep != NULL) {
5537 sep = PyUnicode_FromObject(sep);
5538 if (sep == NULL) {
5539 Py_DECREF(s);
5540 return NULL;
5541 }
5542 }
5543
5544 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5545
5546 Py_DECREF(s);
5547 Py_XDECREF(sep);
5548 return result;
5549}
5550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005551PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552"S.split([sep [,maxsplit]]) -> list of strings\n\
5553\n\
5554Return a list of the words in S, using sep as the\n\
5555delimiter string. If maxsplit is given, at most maxsplit\n\
5556splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005557is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558
5559static PyObject*
5560unicode_split(PyUnicodeObject *self, PyObject *args)
5561{
5562 PyObject *substring = Py_None;
5563 int maxcount = -1;
5564
5565 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5566 return NULL;
5567
5568 if (substring == Py_None)
5569 return split(self, NULL, maxcount);
5570 else if (PyUnicode_Check(substring))
5571 return split(self, (PyUnicodeObject *)substring, maxcount);
5572 else
5573 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5574}
5575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005576PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005577"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578\n\
5579Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005580Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005581is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
5583static PyObject*
5584unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5585{
Guido van Rossum86662912000-04-11 15:38:46 +00005586 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587
Guido van Rossum86662912000-04-11 15:38:46 +00005588 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 return NULL;
5590
Guido van Rossum86662912000-04-11 15:38:46 +00005591 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592}
5593
5594static
5595PyObject *unicode_str(PyUnicodeObject *self)
5596{
Fred Drakee4315f52000-05-09 19:53:39 +00005597 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598}
5599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005600PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601"S.swapcase() -> unicode\n\
5602\n\
5603Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005604and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605
5606static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005607unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 return fixup(self, fixswapcase);
5610}
5611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005612PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613"S.translate(table) -> unicode\n\
5614\n\
5615Return a copy of the string S, where all characters have been mapped\n\
5616through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005617Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5618Unmapped characters are left untouched. Characters mapped to None\n\
5619are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620
5621static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005622unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 return PyUnicode_TranslateCharmap(self->str,
5625 self->length,
5626 table,
5627 "ignore");
5628}
5629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005630PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631"S.upper() -> unicode\n\
5632\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005633Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634
5635static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005636unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 return fixup(self, fixupper);
5639}
5640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005641PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642"S.zfill(width) -> unicode\n\
5643\n\
5644Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005645of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646
5647static PyObject *
5648unicode_zfill(PyUnicodeObject *self, PyObject *args)
5649{
5650 int fill;
5651 PyUnicodeObject *u;
5652
5653 int width;
5654 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5655 return NULL;
5656
5657 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005658 if (PyUnicode_CheckExact(self)) {
5659 Py_INCREF(self);
5660 return (PyObject*) self;
5661 }
5662 else
5663 return PyUnicode_FromUnicode(
5664 PyUnicode_AS_UNICODE(self),
5665 PyUnicode_GET_SIZE(self)
5666 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 }
5668
5669 fill = width - self->length;
5670
5671 u = pad(self, fill, 0, '0');
5672
Walter Dörwald068325e2002-04-15 13:36:47 +00005673 if (u == NULL)
5674 return NULL;
5675
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 if (u->str[fill] == '+' || u->str[fill] == '-') {
5677 /* move sign to beginning of string */
5678 u->str[0] = u->str[fill];
5679 u->str[fill] = '0';
5680 }
5681
5682 return (PyObject*) u;
5683}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684
#if 0
/* Compiled out: return the current value of unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
5692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005693PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005694"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005696Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005698comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699
5700static PyObject *
5701unicode_startswith(PyUnicodeObject *self,
5702 PyObject *args)
5703{
5704 PyUnicodeObject *substring;
5705 int start = 0;
5706 int end = INT_MAX;
5707 PyObject *result;
5708
Guido van Rossumb8872e62000-05-09 14:14:27 +00005709 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5710 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 return NULL;
5712 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5713 (PyObject *)substring);
5714 if (substring == NULL)
5715 return NULL;
5716
Guido van Rossum77f6a652002-04-03 22:41:51 +00005717 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718
5719 Py_DECREF(substring);
5720 return result;
5721}
5722
5723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005724PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005725"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005727Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005729comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730
5731static PyObject *
5732unicode_endswith(PyUnicodeObject *self,
5733 PyObject *args)
5734{
5735 PyUnicodeObject *substring;
5736 int start = 0;
5737 int end = INT_MAX;
5738 PyObject *result;
5739
Guido van Rossumb8872e62000-05-09 14:14:27 +00005740 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5741 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 return NULL;
5743 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5744 (PyObject *)substring);
5745 if (substring == NULL)
5746 return NULL;
5747
Guido van Rossum77f6a652002-04-03 22:41:51 +00005748 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749
5750 Py_DECREF(substring);
5751 return result;
5752}
5753
5754
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005755
5756static PyObject *
5757unicode_getnewargs(PyUnicodeObject *v)
5758{
5759 return Py_BuildValue("(u#)", v->str, v->length);
5760}
5761
5762
/* Method table for the unicode type.  Each entry gives the Python-visible
   method name, the C function implementing it, the calling-convention
   flag (METH_VARARGS, METH_O or METH_NOARGS) and, where one exists, the
   docstring.  The table ends with a {NULL, NULL} sentinel.  The
   "maketrans" entry is commented out, and "capwords" and "freelistsize"
   are compiled out with #if 0. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763static PyMethodDef unicode_methods[] = {
5764
5765 /* Order is according to common usage: often used methods should
5766 appear first, since lookup is done sequentially. */
5767
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005768 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5769 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5770 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5771 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5772 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5773 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5774 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5775 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5776 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5777 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5778 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5779 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5780 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005781 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005782/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5783 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5784 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5785 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005786 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005787 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005788 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005789 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5790 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5791 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5792 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5793 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5794 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5795 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5796 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5797 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5798 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5799 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5800 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5801 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5802 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005803 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005804#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005805 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806#endif
5807
5808#if 0
5809 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005810 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811#endif
5812
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005813 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814 {NULL, NULL}
5815};
5816
Neil Schemenauerce30bc92002-11-18 16:10:18 +00005817static PyObject *
5818unicode_mod(PyObject *v, PyObject *w)
5819{
5820 if (!PyUnicode_Check(v)) {
5821 Py_INCREF(Py_NotImplemented);
5822 return Py_NotImplemented;
5823 }
5824 return PyUnicode_Format(v, w);
5825}
5826
/* Number-protocol table for unicode objects.  Only the nb_remainder slot
   is filled in (with unicode_mod); the four slots before it are
   explicitly 0 and the initializer stops after nb_remainder. */
5827static PyNumberMethods unicode_as_number = {
5828 0, /*nb_add*/
5829 0, /*nb_subtract*/
5830 0, /*nb_multiply*/
5831 0, /*nb_divide*/
5832 unicode_mod, /*nb_remainder*/
5833};
5834
/* Sequence-protocol table for unicode objects.  Length, concatenation,
   repetition, item access, slicing and containment are provided; the two
   assignment slots (sq_ass_item, sq_ass_slice) are 0. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835static PySequenceMethods unicode_as_sequence = {
5836 (inquiry) unicode_length, /* sq_length */
5837 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5838 (intargfunc) unicode_repeat, /* sq_repeat */
5839 (intargfunc) unicode_getitem, /* sq_item */
5840 (intintargfunc) unicode_slice, /* sq_slice */
5841 0, /* sq_ass_item */
5842 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005843 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844};
5845
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005846static PyObject*
5847unicode_subscript(PyUnicodeObject* self, PyObject* item)
5848{
5849 if (PyInt_Check(item)) {
5850 long i = PyInt_AS_LONG(item);
5851 if (i < 0)
5852 i += PyString_GET_SIZE(self);
5853 return unicode_getitem(self, i);
5854 } else if (PyLong_Check(item)) {
5855 long i = PyLong_AsLong(item);
5856 if (i == -1 && PyErr_Occurred())
5857 return NULL;
5858 if (i < 0)
5859 i += PyString_GET_SIZE(self);
5860 return unicode_getitem(self, i);
5861 } else if (PySlice_Check(item)) {
5862 int start, stop, step, slicelength, cur, i;
5863 Py_UNICODE* source_buf;
5864 Py_UNICODE* result_buf;
5865 PyObject* result;
5866
5867 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5868 &start, &stop, &step, &slicelength) < 0) {
5869 return NULL;
5870 }
5871
5872 if (slicelength <= 0) {
5873 return PyUnicode_FromUnicode(NULL, 0);
5874 } else {
5875 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5876 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5877
5878 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5879 result_buf[i] = source_buf[cur];
5880 }
5881
5882 result = PyUnicode_FromUnicode(result_buf, slicelength);
5883 PyMem_FREE(result_buf);
5884 return result;
5885 }
5886 } else {
5887 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5888 return NULL;
5889 }
5890}
5891
/* Mapping-protocol table for unicode objects: length and subscript are
   provided; subscript assignment is 0. */
5892static PyMappingMethods unicode_as_mapping = {
5893 (inquiry)unicode_length, /* mp_length */
5894 (binaryfunc)unicode_subscript, /* mp_subscript */
5895 (objobjargproc)0, /* mp_ass_subscript */
5896};
5897
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898static int
5899unicode_buffer_getreadbuf(PyUnicodeObject *self,
5900 int index,
5901 const void **ptr)
5902{
5903 if (index != 0) {
5904 PyErr_SetString(PyExc_SystemError,
5905 "accessing non-existent unicode segment");
5906 return -1;
5907 }
5908 *ptr = (void *) self->str;
5909 return PyUnicode_GET_DATA_SIZE(self);
5910}
5911
5912static int
5913unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5914 const void **ptr)
5915{
5916 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005917 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 return -1;
5919}
5920
5921static int
5922unicode_buffer_getsegcount(PyUnicodeObject *self,
5923 int *lenp)
5924{
5925 if (lenp)
5926 *lenp = PyUnicode_GET_DATA_SIZE(self);
5927 return 1;
5928}
5929
5930static int
5931unicode_buffer_getcharbuf(PyUnicodeObject *self,
5932 int index,
5933 const void **ptr)
5934{
5935 PyObject *str;
5936
5937 if (index != 0) {
5938 PyErr_SetString(PyExc_SystemError,
5939 "accessing non-existent unicode segment");
5940 return -1;
5941 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005942 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 if (str == NULL)
5944 return -1;
5945 *ptr = (void *) PyString_AS_STRING(str);
5946 return PyString_GET_SIZE(str);
5947}
5948
5949/* Helpers for PyUnicode_Format() */
5950
5951static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005952getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953{
5954 int argidx = *p_argidx;
5955 if (argidx < arglen) {
5956 (*p_argidx)++;
5957 if (arglen < 0)
5958 return args;
5959 else
5960 return PyTuple_GetItem(args, argidx);
5961 }
5962 PyErr_SetString(PyExc_TypeError,
5963 "not enough arguments for format string");
5964 return NULL;
5965}
5966
/* Bit flags for format-specifier options; one distinct bit each, so they
   can be OR-ed together and tested with '&'. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
5972
5973static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975{
5976 register int i;
5977 int len;
5978 va_list va;
5979 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981
5982 /* First, format the string as char array, then expand to Py_UNICODE
5983 array. */
5984 charbuffer = (char *)buffer;
5985 len = vsprintf(charbuffer, format, va);
5986 for (i = len - 1; i >= 0; i--)
5987 buffer[i] = (Py_UNICODE) charbuffer[i];
5988
5989 va_end(va);
5990 return len;
5991}
5992
Guido van Rossum078151d2002-08-11 04:24:12 +00005993/* XXX To save some code duplication, formatfloat/long/int could have been
5994 shared with stringobject.c, converting from 8-bit to Unicode after the
5995 formatting is done. */
5996
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997static int
5998formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005999 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 int flags,
6001 int prec,
6002 int type,
6003 PyObject *v)
6004{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006005 /* fmt = '%#.' + `prec` + `type`
6006 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 char fmt[20];
6008 double x;
6009
6010 x = PyFloat_AsDouble(v);
6011 if (x == -1.0 && PyErr_Occurred())
6012 return -1;
6013 if (prec < 0)
6014 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6016 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006017 /* Worst case length calc to ensure no buffer overrun:
6018
6019 'g' formats:
6020 fmt = %#.<prec>g
6021 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6022 for any double rep.)
6023 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6024
6025 'f' formats:
6026 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6027 len = 1 + 50 + 1 + prec = 52 + prec
6028
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006029 If prec=0 the effective precision is 1 (the leading digit is
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006030 always given), therefore increase the length by one.
6031
6032 */
6033 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6034 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006035 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006036 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006037 return -1;
6038 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006039 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6040 (flags&F_ALT) ? "#" : "",
6041 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 return usprintf(buf, fmt, x);
6043}
6044
Tim Peters38fd5b62000-09-21 05:43:11 +00006045static PyObject*
6046formatlong(PyObject *val, int flags, int prec, int type)
6047{
6048 char *buf;
6049 int i, len;
6050 PyObject *str; /* temporary string object. */
6051 PyUnicodeObject *result;
6052
6053 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6054 if (!str)
6055 return NULL;
6056 result = _PyUnicode_New(len);
6057 for (i = 0; i < len; i++)
6058 result->str[i] = buf[i];
6059 result->str[len] = 0;
6060 Py_DECREF(str);
6061 return (PyObject*)result;
6062}
6063
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064static int
6065formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006066 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 int flags,
6068 int prec,
6069 int type,
6070 PyObject *v)
6071{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006072 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006073 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6074 * + 1 + 1
6075 * = 24
6076 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006077 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 long x;
6079
6080 x = PyInt_AsLong(v);
6081 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006082 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00006083 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00006084 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00006085 "%u/%o/%x/%X of negative int will return "
6086 "a signed string in Python 2.4 and up") < 0)
6087 return -1;
6088 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006090 prec = 1;
6091
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006092 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006093 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6094 */
6095 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006096 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006097 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006098 return -1;
6099 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006100
6101 if ((flags & F_ALT) &&
6102 (type == 'x' || type == 'X')) {
6103 /* When converting under %#x or %#X, there are a number
6104 * of issues that cause pain:
6105 * - when 0 is being converted, the C standard leaves off
6106 * the '0x' or '0X', which is inconsistent with other
6107 * %#x/%#X conversions and inconsistent with Python's
6108 * hex() function
6109 * - there are platforms that violate the standard and
6110 * convert 0 with the '0x' or '0X'
6111 * (Metrowerks, Compaq Tru64)
6112 * - there are platforms that give '0x' when converting
6113 * under %#X, but convert 0 in accordance with the
6114 * standard (OS/2 EMX)
6115 *
6116 * We can achieve the desired consistency by inserting our
6117 * own '0x' or '0X' prefix, and substituting %x/%X in place
6118 * of %#x/%#X.
6119 *
6120 * Note that this is the same approach as used in
6121 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006122 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006123 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6124 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006125 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006126 else {
6127 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6128 (flags&F_ALT) ? "#" : "",
6129 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 return usprintf(buf, fmt, x);
6132}
6133
6134static int
6135formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006136 size_t buflen,
6137 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006139 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006140 if (PyUnicode_Check(v)) {
6141 if (PyUnicode_GET_SIZE(v) != 1)
6142 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006146 else if (PyString_Check(v)) {
6147 if (PyString_GET_SIZE(v) != 1)
6148 goto onError;
6149 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151
6152 else {
6153 /* Integer input truncated to a character */
6154 long x;
6155 x = PyInt_AsLong(v);
6156 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006157 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006158#ifdef Py_UNICODE_WIDE
6159 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006160 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006161 "%c arg not in range(0x110000) "
6162 "(wide Python build)");
6163 return -1;
6164 }
6165#else
6166 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006167 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006168 "%c arg not in range(0x10000) "
6169 "(narrow Python build)");
6170 return -1;
6171 }
6172#endif
6173 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 }
6175 buf[1] = '\0';
6176 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006177
6178 onError:
6179 PyErr_SetString(PyExc_TypeError,
6180 "%c requires int or char");
6181 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182}
6183
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (for example after sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
6193
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194PyObject *PyUnicode_Format(PyObject *format,
6195 PyObject *args)
6196{
6197 Py_UNICODE *fmt, *res;
6198 int fmtcnt, rescnt, reslen, arglen, argidx;
6199 int args_owned = 0;
6200 PyUnicodeObject *result = NULL;
6201 PyObject *dict = NULL;
6202 PyObject *uformat;
6203
6204 if (format == NULL || args == NULL) {
6205 PyErr_BadInternalCall();
6206 return NULL;
6207 }
6208 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006209 if (uformat == NULL)
6210 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 fmt = PyUnicode_AS_UNICODE(uformat);
6212 fmtcnt = PyUnicode_GET_SIZE(uformat);
6213
6214 reslen = rescnt = fmtcnt + 100;
6215 result = _PyUnicode_New(reslen);
6216 if (result == NULL)
6217 goto onError;
6218 res = PyUnicode_AS_UNICODE(result);
6219
6220 if (PyTuple_Check(args)) {
6221 arglen = PyTuple_Size(args);
6222 argidx = 0;
6223 }
6224 else {
6225 arglen = -1;
6226 argidx = -2;
6227 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006228 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6229 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 dict = args;
6231
6232 while (--fmtcnt >= 0) {
6233 if (*fmt != '%') {
6234 if (--rescnt < 0) {
6235 rescnt = fmtcnt + 100;
6236 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006237 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 return NULL;
6239 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6240 --rescnt;
6241 }
6242 *res++ = *fmt++;
6243 }
6244 else {
6245 /* Got a format specifier */
6246 int flags = 0;
6247 int width = -1;
6248 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 Py_UNICODE c = '\0';
6250 Py_UNICODE fill;
6251 PyObject *v = NULL;
6252 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006253 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 Py_UNICODE sign;
6255 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006256 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257
6258 fmt++;
6259 if (*fmt == '(') {
6260 Py_UNICODE *keystart;
6261 int keylen;
6262 PyObject *key;
6263 int pcount = 1;
6264
6265 if (dict == NULL) {
6266 PyErr_SetString(PyExc_TypeError,
6267 "format requires a mapping");
6268 goto onError;
6269 }
6270 ++fmt;
6271 --fmtcnt;
6272 keystart = fmt;
6273 /* Skip over balanced parentheses */
6274 while (pcount > 0 && --fmtcnt >= 0) {
6275 if (*fmt == ')')
6276 --pcount;
6277 else if (*fmt == '(')
6278 ++pcount;
6279 fmt++;
6280 }
6281 keylen = fmt - keystart - 1;
6282 if (fmtcnt < 0 || pcount > 0) {
6283 PyErr_SetString(PyExc_ValueError,
6284 "incomplete format key");
6285 goto onError;
6286 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006287#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006288 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 then looked up since Python uses strings to hold
6290 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006291 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 key = PyUnicode_EncodeUTF8(keystart,
6293 keylen,
6294 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006295#else
6296 key = PyUnicode_FromUnicode(keystart, keylen);
6297#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 if (key == NULL)
6299 goto onError;
6300 if (args_owned) {
6301 Py_DECREF(args);
6302 args_owned = 0;
6303 }
6304 args = PyObject_GetItem(dict, key);
6305 Py_DECREF(key);
6306 if (args == NULL) {
6307 goto onError;
6308 }
6309 args_owned = 1;
6310 arglen = -1;
6311 argidx = -2;
6312 }
6313 while (--fmtcnt >= 0) {
6314 switch (c = *fmt++) {
6315 case '-': flags |= F_LJUST; continue;
6316 case '+': flags |= F_SIGN; continue;
6317 case ' ': flags |= F_BLANK; continue;
6318 case '#': flags |= F_ALT; continue;
6319 case '0': flags |= F_ZERO; continue;
6320 }
6321 break;
6322 }
6323 if (c == '*') {
6324 v = getnextarg(args, arglen, &argidx);
6325 if (v == NULL)
6326 goto onError;
6327 if (!PyInt_Check(v)) {
6328 PyErr_SetString(PyExc_TypeError,
6329 "* wants int");
6330 goto onError;
6331 }
6332 width = PyInt_AsLong(v);
6333 if (width < 0) {
6334 flags |= F_LJUST;
6335 width = -width;
6336 }
6337 if (--fmtcnt >= 0)
6338 c = *fmt++;
6339 }
6340 else if (c >= '0' && c <= '9') {
6341 width = c - '0';
6342 while (--fmtcnt >= 0) {
6343 c = *fmt++;
6344 if (c < '0' || c > '9')
6345 break;
6346 if ((width*10) / 10 != width) {
6347 PyErr_SetString(PyExc_ValueError,
6348 "width too big");
6349 goto onError;
6350 }
6351 width = width*10 + (c - '0');
6352 }
6353 }
6354 if (c == '.') {
6355 prec = 0;
6356 if (--fmtcnt >= 0)
6357 c = *fmt++;
6358 if (c == '*') {
6359 v = getnextarg(args, arglen, &argidx);
6360 if (v == NULL)
6361 goto onError;
6362 if (!PyInt_Check(v)) {
6363 PyErr_SetString(PyExc_TypeError,
6364 "* wants int");
6365 goto onError;
6366 }
6367 prec = PyInt_AsLong(v);
6368 if (prec < 0)
6369 prec = 0;
6370 if (--fmtcnt >= 0)
6371 c = *fmt++;
6372 }
6373 else if (c >= '0' && c <= '9') {
6374 prec = c - '0';
6375 while (--fmtcnt >= 0) {
6376 c = Py_CHARMASK(*fmt++);
6377 if (c < '0' || c > '9')
6378 break;
6379 if ((prec*10) / 10 != prec) {
6380 PyErr_SetString(PyExc_ValueError,
6381 "prec too big");
6382 goto onError;
6383 }
6384 prec = prec*10 + (c - '0');
6385 }
6386 }
6387 } /* prec */
6388 if (fmtcnt >= 0) {
6389 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 if (--fmtcnt >= 0)
6391 c = *fmt++;
6392 }
6393 }
6394 if (fmtcnt < 0) {
6395 PyErr_SetString(PyExc_ValueError,
6396 "incomplete format");
6397 goto onError;
6398 }
6399 if (c != '%') {
6400 v = getnextarg(args, arglen, &argidx);
6401 if (v == NULL)
6402 goto onError;
6403 }
6404 sign = 0;
6405 fill = ' ';
6406 switch (c) {
6407
6408 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006409 pbuf = formatbuf;
6410 /* presume that buffer length is at least 1 */
6411 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 len = 1;
6413 break;
6414
6415 case 's':
6416 case 'r':
6417 if (PyUnicode_Check(v) && c == 's') {
6418 temp = v;
6419 Py_INCREF(temp);
6420 }
6421 else {
6422 PyObject *unicode;
6423 if (c == 's')
6424 temp = PyObject_Str(v);
6425 else
6426 temp = PyObject_Repr(v);
6427 if (temp == NULL)
6428 goto onError;
6429 if (!PyString_Check(temp)) {
6430 /* XXX Note: this should never happen, since
6431 PyObject_Repr() and PyObject_Str() assure
6432 this */
6433 Py_DECREF(temp);
6434 PyErr_SetString(PyExc_TypeError,
6435 "%s argument has non-string str()");
6436 goto onError;
6437 }
Fred Drakee4315f52000-05-09 19:53:39 +00006438 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006440 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 "strict");
6442 Py_DECREF(temp);
6443 temp = unicode;
6444 if (temp == NULL)
6445 goto onError;
6446 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006447 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 len = PyUnicode_GET_SIZE(temp);
6449 if (prec >= 0 && len > prec)
6450 len = prec;
6451 break;
6452
6453 case 'i':
6454 case 'd':
6455 case 'u':
6456 case 'o':
6457 case 'x':
6458 case 'X':
6459 if (c == 'i')
6460 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006461 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006462 temp = formatlong(v, flags, prec, c);
6463 if (!temp)
6464 goto onError;
6465 pbuf = PyUnicode_AS_UNICODE(temp);
6466 len = PyUnicode_GET_SIZE(temp);
6467 /* unbounded ints can always produce
6468 a sign character! */
6469 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006471 else {
6472 pbuf = formatbuf;
6473 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6474 flags, prec, c, v);
6475 if (len < 0)
6476 goto onError;
6477 /* only d conversion is signed */
6478 sign = c == 'd';
6479 }
6480 if (flags & F_ZERO)
6481 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 break;
6483
6484 case 'e':
6485 case 'E':
6486 case 'f':
6487 case 'g':
6488 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006489 pbuf = formatbuf;
6490 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6491 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 if (len < 0)
6493 goto onError;
6494 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006495 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 fill = '0';
6497 break;
6498
6499 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006500 pbuf = formatbuf;
6501 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 if (len < 0)
6503 goto onError;
6504 break;
6505
6506 default:
6507 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006508 "unsupported format character '%c' (0x%x) "
6509 "at index %i",
Neal Norwitza0378e12002-09-13 13:47:06 +00006510 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006511 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006512 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 goto onError;
6514 }
6515 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006516 if (*pbuf == '-' || *pbuf == '+') {
6517 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 len--;
6519 }
6520 else if (flags & F_SIGN)
6521 sign = '+';
6522 else if (flags & F_BLANK)
6523 sign = ' ';
6524 else
6525 sign = 0;
6526 }
6527 if (width < len)
6528 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006529 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 reslen -= rescnt;
6531 rescnt = width + fmtcnt + 100;
6532 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006533 if (reslen < 0) {
6534 Py_DECREF(result);
6535 return PyErr_NoMemory();
6536 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006537 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 return NULL;
6539 res = PyUnicode_AS_UNICODE(result)
6540 + reslen - rescnt;
6541 }
6542 if (sign) {
6543 if (fill != ' ')
6544 *res++ = sign;
6545 rescnt--;
6546 if (width > len)
6547 width--;
6548 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006549 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6550 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006551 assert(pbuf[1] == c);
6552 if (fill != ' ') {
6553 *res++ = *pbuf++;
6554 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006555 }
Tim Petersfff53252001-04-12 18:38:48 +00006556 rescnt -= 2;
6557 width -= 2;
6558 if (width < 0)
6559 width = 0;
6560 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 if (width > len && !(flags & F_LJUST)) {
6563 do {
6564 --rescnt;
6565 *res++ = fill;
6566 } while (--width > len);
6567 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006568 if (fill == ' ') {
6569 if (sign)
6570 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006571 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006572 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006573 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006574 *res++ = *pbuf++;
6575 *res++ = *pbuf++;
6576 }
6577 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006578 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 res += len;
6580 rescnt -= len;
6581 while (--width >= len) {
6582 --rescnt;
6583 *res++ = ' ';
6584 }
6585 if (dict && (argidx < arglen) && c != '%') {
6586 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006587 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 goto onError;
6589 }
6590 Py_XDECREF(temp);
6591 } /* '%' */
6592 } /* until end */
6593 if (argidx < arglen && !dict) {
6594 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006595 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 goto onError;
6597 }
6598
6599 if (args_owned) {
6600 Py_DECREF(args);
6601 }
6602 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006603 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006604 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 return (PyObject *)result;
6606
6607 onError:
6608 Py_XDECREF(result);
6609 Py_DECREF(uformat);
6610 if (args_owned) {
6611 Py_DECREF(args);
6612 }
6613 return NULL;
6614}
6615
6616static PyBufferProcs unicode_as_buffer = {
6617 (getreadbufferproc) unicode_buffer_getreadbuf,
6618 (getwritebufferproc) unicode_buffer_getwritebuf,
6619 (getsegcountproc) unicode_buffer_getsegcount,
6620 (getcharbufferproc) unicode_buffer_getcharbuf,
6621};
6622
Jeremy Hylton938ace62002-07-17 16:30:39 +00006623static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006624unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6625
Tim Peters6d6c1a32001-08-02 04:15:00 +00006626static PyObject *
6627unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6628{
6629 PyObject *x = NULL;
6630 static char *kwlist[] = {"string", "encoding", "errors", 0};
6631 char *encoding = NULL;
6632 char *errors = NULL;
6633
Guido van Rossume023fe02001-08-30 03:12:59 +00006634 if (type != &PyUnicode_Type)
6635 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006636 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6637 kwlist, &x, &encoding, &errors))
6638 return NULL;
6639 if (x == NULL)
6640 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006641 if (encoding == NULL && errors == NULL)
6642 return PyObject_Unicode(x);
6643 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006644 return PyUnicode_FromEncodedObject(x, encoding, errors);
6645}
6646
Guido van Rossume023fe02001-08-30 03:12:59 +00006647static PyObject *
6648unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6649{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006650 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006651 int n;
6652
6653 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6654 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6655 if (tmp == NULL)
6656 return NULL;
6657 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006658 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
6659 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00006660 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00006661 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6662 if (pnew->str == NULL) {
6663 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006664 PyObject_Del(pnew);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00006665 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00006666 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006667 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6668 pnew->length = n;
6669 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006670 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006671 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006672}
6673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006674PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00006675"unicode(string [, encoding[, errors]]) -> object\n\
6676\n\
6677Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00006678encoding defaults to the current default string encoding.\n\
6679errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006680
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681PyTypeObject PyUnicode_Type = {
6682 PyObject_HEAD_INIT(&PyType_Type)
6683 0, /* ob_size */
6684 "unicode", /* tp_name */
6685 sizeof(PyUnicodeObject), /* tp_size */
6686 0, /* tp_itemsize */
6687 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00006688 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006690 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 0, /* tp_setattr */
6692 (cmpfunc) unicode_compare, /* tp_compare */
6693 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006694 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006696 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 (hashfunc) unicode_hash, /* tp_hash*/
6698 0, /* tp_call*/
6699 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006700 PyObject_GenericGetAttr, /* tp_getattro */
6701 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006703 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
6704 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006705 unicode_doc, /* tp_doc */
6706 0, /* tp_traverse */
6707 0, /* tp_clear */
6708 0, /* tp_richcompare */
6709 0, /* tp_weaklistoffset */
6710 0, /* tp_iter */
6711 0, /* tp_iternext */
6712 unicode_methods, /* tp_methods */
6713 0, /* tp_members */
6714 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00006715 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006716 0, /* tp_dict */
6717 0, /* tp_descr_get */
6718 0, /* tp_descr_set */
6719 0, /* tp_dictoffset */
6720 0, /* tp_init */
6721 0, /* tp_alloc */
6722 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006723 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724};
6725
6726/* Initialize the Unicode implementation */
6727
Thomas Wouters78890102000-07-22 19:25:51 +00006728void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006730 int i;
6731
Fred Drakee4315f52000-05-09 19:53:39 +00006732 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006733 unicode_freelist = NULL;
6734 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00006736 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006737 for (i = 0; i < 256; i++)
6738 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006739 if (PyType_Ready(&PyUnicode_Type) < 0)
6740 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741}
6742
6743/* Finalize the Unicode implementation */
6744
6745void
Thomas Wouters78890102000-07-22 19:25:51 +00006746_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006748 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006749 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006751 Py_XDECREF(unicode_empty);
6752 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006753
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006754 for (i = 0; i < 256; i++) {
6755 if (unicode_latin1[i]) {
6756 Py_DECREF(unicode_latin1[i]);
6757 unicode_latin1[i] = NULL;
6758 }
6759 }
6760
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006761 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762 PyUnicodeObject *v = u;
6763 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006764 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006765 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006766 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006767 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006769 unicode_freelist = NULL;
6770 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771}