blob: e3bef575406fc9aeec9ed29fadc5a1ac781cef8b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects on the free list.

   When an object is put on the free list, its character buffer is
   released only if the object's length is greater than or equal to this
   value; shorter buffers stay allocated and are reused by the next
   allocation, which saves a malloc() call for small Unicode objects.

   Worst case, this leaves
       MAX_UNICODE_FREELIST_SIZE *
       (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused memory lying around.

   A value of 0 switches the optimization off.

   Note: this is an experimental feature.  If Unicode objects cause core
   dumps, try disabling it first.
*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection: big endian when WORDS_BIGENDIAN is defined,
   little endian otherwise. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
/* For use inside unicodeobject.c only: calls PyUnicode_Resize() after
   casting the address of the object variable to PyObject **. */
#define _PyUnicode_Resize(objvar, newlength) \
    PyUnicode_Resize(((PyObject **)(objvar)), newlength)
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
371 for (i = size; i >= 0; i--)
372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
387 if (size > PyUnicode_GET_SIZE(unicode))
388 size = PyUnicode_GET_SIZE(unicode);
389#ifdef HAVE_USABLE_WCHAR_T
390 memcpy(w, unicode->str, size * sizeof(wchar_t));
391#else
392 {
393 register Py_UNICODE *u;
394 register int i;
395 u = PyUnicode_AS_UNICODE(unicode);
396 for (i = size; i >= 0; i--)
397 *w++ = *u++;
398 }
399#endif
400
401 return size;
402}
403
404#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
408 Py_UNICODE s[2];
409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
426 if (ordinal <= 0xffff) {
427 /* UCS-2 character */
428 s[0] = (Py_UNICODE) ordinal;
429 return PyUnicode_FromUnicode(s, 1);
430 }
431 else {
432#ifndef Py_UNICODE_WIDE
433 /* UCS-4 character. store as two surrogate characters */
434 ordinal -= 0x10000L;
435 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
436 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
437 return PyUnicode_FromUnicode(s, 2);
438#else
439 s[0] = (Py_UNICODE)ordinal;
440 return PyUnicode_FromUnicode(s, 1);
441#endif
442 }
443}
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445PyObject *PyUnicode_FromObject(register PyObject *obj)
446{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000447 /* XXX Perhaps we should make this API an alias of
448 PyObject_Unicode() instead ?! */
449 if (PyUnicode_CheckExact(obj)) {
450 Py_INCREF(obj);
451 return obj;
452 }
453 if (PyUnicode_Check(obj)) {
454 /* For a Unicode subtype that's not a Unicode object,
455 return a true Unicode object with the same data. */
456 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
457 PyUnicode_GET_SIZE(obj));
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
460}
461
462PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
463 const char *encoding,
464 const char *errors)
465{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000466 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000468 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000469
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 if (obj == NULL) {
471 PyErr_BadInternalCall();
472 return NULL;
473 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000474
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000475#if 0
476 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000477 that no encodings is given and then redirect to
478 PyObject_Unicode() which then applies the additional logic for
479 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481 NOTE: This API should really only be used for object which
482 represent *encoded* Unicode !
483
484 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000485 if (PyUnicode_Check(obj)) {
486 if (encoding) {
487 PyErr_SetString(PyExc_TypeError,
488 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000489 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000492 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000493#else
494 if (PyUnicode_Check(obj)) {
495 PyErr_SetString(PyExc_TypeError,
496 "decoding Unicode is not supported");
497 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000498 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499#endif
500
501 /* Coerce object */
502 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000503 s = PyString_AS_STRING(obj);
504 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000505 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000506 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
507 /* Overwrite the error message with something more useful in
508 case of a TypeError. */
509 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000511 "coercing to Unicode: need string or buffer, "
512 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000513 obj->ob_type->tp_name);
514 goto onError;
515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000518 if (len == 0) {
519 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521 }
Tim Petersced69f82003-09-16 20:30:58 +0000522 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000523 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000524
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000525 return v;
526
527 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529}
530
531PyObject *PyUnicode_Decode(const char *s,
532 int size,
533 const char *encoding,
534 const char *errors)
535{
536 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000537
538 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000539 encoding = PyUnicode_GetDefaultEncoding();
540
541 /* Shortcuts for common default encodings */
542 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000544 else if (strcmp(encoding, "latin-1") == 0)
545 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000546#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
547 else if (strcmp(encoding, "mbcs") == 0)
548 return PyUnicode_DecodeMBCS(s, size, errors);
549#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000550 else if (strcmp(encoding, "ascii") == 0)
551 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552
553 /* Decode via the codec registry */
554 buffer = PyBuffer_FromMemory((void *)s, size);
555 if (buffer == NULL)
556 goto onError;
557 unicode = PyCodec_Decode(buffer, encoding, errors);
558 if (unicode == NULL)
559 goto onError;
560 if (!PyUnicode_Check(unicode)) {
561 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000562 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000563 unicode->ob_type->tp_name);
564 Py_DECREF(unicode);
565 goto onError;
566 }
567 Py_DECREF(buffer);
568 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000569
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 onError:
571 Py_XDECREF(buffer);
572 return NULL;
573}
574
575PyObject *PyUnicode_Encode(const Py_UNICODE *s,
576 int size,
577 const char *encoding,
578 const char *errors)
579{
580 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000581
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 unicode = PyUnicode_FromUnicode(s, size);
583 if (unicode == NULL)
584 return NULL;
585 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
586 Py_DECREF(unicode);
587 return v;
588}
589
590PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
591 const char *encoding,
592 const char *errors)
593{
594 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596 if (!PyUnicode_Check(unicode)) {
597 PyErr_BadArgument();
598 goto onError;
599 }
Fred Drakee4315f52000-05-09 19:53:39 +0000600
Tim Petersced69f82003-09-16 20:30:58 +0000601 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000602 encoding = PyUnicode_GetDefaultEncoding();
603
604 /* Shortcuts for common default encodings */
605 if (errors == NULL) {
606 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000607 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000608 else if (strcmp(encoding, "latin-1") == 0)
609 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000610#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
611 else if (strcmp(encoding, "mbcs") == 0)
612 return PyUnicode_AsMBCSString(unicode);
613#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000614 else if (strcmp(encoding, "ascii") == 0)
615 return PyUnicode_AsASCIIString(unicode);
616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617
618 /* Encode via the codec registry */
619 v = PyCodec_Encode(unicode, encoding, errors);
620 if (v == NULL)
621 goto onError;
622 /* XXX Should we really enforce this ? */
623 if (!PyString_Check(v)) {
624 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000625 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 v->ob_type->tp_name);
627 Py_DECREF(v);
628 goto onError;
629 }
630 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000631
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632 onError:
633 return NULL;
634}
635
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000636PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
637 const char *errors)
638{
639 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
640
641 if (v)
642 return v;
643 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
644 if (v && errors == NULL)
645 ((PyUnicodeObject *)unicode)->defenc = v;
646 return v;
647}
648
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
650{
651 if (!PyUnicode_Check(unicode)) {
652 PyErr_BadArgument();
653 goto onError;
654 }
655 return PyUnicode_AS_UNICODE(unicode);
656
657 onError:
658 return NULL;
659}
660
661int PyUnicode_GetSize(PyObject *unicode)
662{
663 if (!PyUnicode_Check(unicode)) {
664 PyErr_BadArgument();
665 goto onError;
666 }
667 return PyUnicode_GET_SIZE(unicode);
668
669 onError:
670 return -1;
671}
672
Thomas Wouters78890102000-07-22 19:25:51 +0000673const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000674{
675 return unicode_default_encoding;
676}
677
678int PyUnicode_SetDefaultEncoding(const char *encoding)
679{
680 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000681
Fred Drakee4315f52000-05-09 19:53:39 +0000682 /* Make sure the encoding is valid. As side effect, this also
683 loads the encoding into the codec registry cache. */
684 v = _PyCodec_Lookup(encoding);
685 if (v == NULL)
686 goto onError;
687 Py_DECREF(v);
688 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000689 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000690 sizeof(unicode_default_encoding));
691 return 0;
692
693 onError:
694 return -1;
695}
696
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000697/* error handling callback helper:
698 build arguments, call the callback and check the arguments,
699 if no exception occured, copy the replacement to the output
700 and adjust various state variables.
701 return 0 on success, -1 on error
702*/
703
704static
705int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
706 const char *encoding, const char *reason,
707 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
708 PyObject **output, int *outpos, Py_UNICODE **outptr)
709{
710 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
711
712 PyObject *restuple = NULL;
713 PyObject *repunicode = NULL;
714 int outsize = PyUnicode_GET_SIZE(*output);
715 int requiredsize;
716 int newpos;
717 Py_UNICODE *repptr;
718 int repsize;
719 int res = -1;
720
721 if (*errorHandler == NULL) {
722 *errorHandler = PyCodec_LookupError(errors);
723 if (*errorHandler == NULL)
724 goto onError;
725 }
726
727 if (*exceptionObject == NULL) {
728 *exceptionObject = PyUnicodeDecodeError_Create(
729 encoding, input, insize, *startinpos, *endinpos, reason);
730 if (*exceptionObject == NULL)
731 goto onError;
732 }
733 else {
734 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
735 goto onError;
736 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
737 goto onError;
738 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
739 goto onError;
740 }
741
742 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
743 if (restuple == NULL)
744 goto onError;
745 if (!PyTuple_Check(restuple)) {
746 PyErr_Format(PyExc_TypeError, &argparse[4]);
747 goto onError;
748 }
749 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
750 goto onError;
751 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000752 newpos = insize+newpos;
753 if (newpos<0 || newpos>insize) {
754 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
755 goto onError;
756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000757
758 /* need more space? (at least enough for what we
759 have+the replacement+the rest of the string (starting
760 at the new input position), so we won't have to check space
761 when there are no errors in the rest of the string) */
762 repptr = PyUnicode_AS_UNICODE(repunicode);
763 repsize = PyUnicode_GET_SIZE(repunicode);
764 requiredsize = *outpos + repsize + insize-newpos;
765 if (requiredsize > outsize) {
766 if (requiredsize<2*outsize)
767 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000768 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000769 goto onError;
770 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
771 }
772 *endinpos = newpos;
773 *inptr = input + newpos;
774 Py_UNICODE_COPY(*outptr, repptr, repsize);
775 *outptr += repsize;
776 *outpos += repsize;
777 /* we made it! */
778 res = 0;
779
780 onError:
781 Py_XDECREF(restuple);
782 return res;
783}
784
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000785/* --- UTF-7 Codec -------------------------------------------------------- */
786
787/* see RFC2152 for details */
788
/* Classification of the 128 ASCII characters for the UTF-7 codec,
   indexed by character value.  An entry tells whether the character is
   special, i.e. cannot be encoded directly:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
807
/* Helper macros for the UTF-7 codec. */

/* True if c must be base64-encoded: it is non-ASCII or always special, or
   it is whitespace / a Set O character and the matching flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  Explicit ASCII ranges are used instead of
   isalnum() so the result does not depend on the locale and the macro is
   well defined for values outside the unsigned char range. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Six-bit value of the base64 digit c (only meaningful if B64CHAR(c)). */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 digits through out for as long as at least six of the
   `bits` pending bits of ch remain; `bits` is reduced accordingly. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit 16-bit units through out for as long as at least 16 of the `bits`
   pending bits of ch remain.  Expects a variable `errmsg` and a label
   `utf7Error` in the calling function; a unit in 0xDC00..0xDFFF sets
   errmsg and jumps there. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high \
                   surrogate so let's not bother seeing if the low \
                   surrogate is correct or not */ \
                surrogate = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't \
                   represent it in a 16-bit character */ \
                surrogate = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
842
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000843PyObject *PyUnicode_DecodeUTF7(const char *s,
844 int size,
845 const char *errors)
846{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000847 const char *starts = s;
848 int startinpos;
849 int endinpos;
850 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000851 const char *e;
852 PyUnicodeObject *unicode;
853 Py_UNICODE *p;
854 const char *errmsg = "";
855 int inShift = 0;
856 unsigned int bitsleft = 0;
857 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000858 int surrogate = 0;
859 PyObject *errorHandler = NULL;
860 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000861
862 unicode = _PyUnicode_New(size);
863 if (!unicode)
864 return NULL;
865 if (size == 0)
866 return (PyObject *)unicode;
867
868 p = unicode->str;
869 e = s + size;
870
871 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000872 Py_UNICODE ch;
873 restart:
874 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000875
876 if (inShift) {
877 if ((ch == '-') || !B64CHAR(ch)) {
878 inShift = 0;
879 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000880
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000881 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
882 if (bitsleft >= 6) {
883 /* The shift sequence has a partial character in it. If
884 bitsleft < 6 then we could just classify it as padding
885 but that is not the case here */
886
887 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000888 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889 }
890 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000891 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000892 here so indicate the potential of a misencoded character. */
893
894 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
895 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
896 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000897 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000898 }
899
900 if (ch == '-') {
901 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000902 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000903 inShift = 1;
904 }
905 } else if (SPECIAL(ch,0,0)) {
906 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000907 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908 } else {
909 *p++ = ch;
910 }
911 } else {
912 charsleft = (charsleft << 6) | UB64(ch);
913 bitsleft += 6;
914 s++;
915 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
916 }
917 }
918 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000919 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000920 s++;
921 if (s < e && *s == '-') {
922 s++;
923 *p++ = '+';
924 } else
925 {
926 inShift = 1;
927 bitsleft = 0;
928 }
929 }
930 else if (SPECIAL(ch,0,0)) {
931 errmsg = "unexpected special character";
932 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000933 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000934 }
935 else {
936 *p++ = ch;
937 s++;
938 }
939 continue;
940 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000941 outpos = p-PyUnicode_AS_UNICODE(unicode);
942 endinpos = s-starts;
943 if (unicode_decode_call_errorhandler(
944 errors, &errorHandler,
945 "utf7", errmsg,
946 starts, size, &startinpos, &endinpos, &exc, &s,
947 (PyObject **)&unicode, &outpos, &p))
948 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 }
950
951 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000952 outpos = p-PyUnicode_AS_UNICODE(unicode);
953 endinpos = size;
954 if (unicode_decode_call_errorhandler(
955 errors, &errorHandler,
956 "utf7", "unterminated shift sequence",
957 starts, size, &startinpos, &endinpos, &exc, &s,
958 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000959 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000960 if (s < e)
961 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000962 }
963
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000964 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000965 goto onError;
966
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000967 Py_XDECREF(errorHandler);
968 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000969 return (PyObject *)unicode;
970
971onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000972 Py_XDECREF(errorHandler);
973 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974 Py_DECREF(unicode);
975 return NULL;
976}
977
978
979PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
980 int size,
981 int encodeSetO,
982 int encodeWhiteSpace,
983 const char *errors)
984{
985 PyObject *v;
986 /* It might be possible to tighten this worst case */
987 unsigned int cbAllocated = 5 * size;
988 int inShift = 0;
989 int i = 0;
990 unsigned int bitsleft = 0;
991 unsigned long charsleft = 0;
992 char * out;
993 char * start;
994
995 if (size == 0)
996 return PyString_FromStringAndSize(NULL, 0);
997
998 v = PyString_FromStringAndSize(NULL, cbAllocated);
999 if (v == NULL)
1000 return NULL;
1001
1002 start = out = PyString_AS_STRING(v);
1003 for (;i < size; ++i) {
1004 Py_UNICODE ch = s[i];
1005
1006 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001007 if (ch == '+') {
1008 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001009 *out++ = '-';
1010 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1011 charsleft = ch;
1012 bitsleft = 16;
1013 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001014 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001015 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001016 } else {
1017 *out++ = (char) ch;
1018 }
1019 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001020 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1021 *out++ = B64(charsleft << (6-bitsleft));
1022 charsleft = 0;
1023 bitsleft = 0;
1024 /* Characters not in the BASE64 set implicitly unshift the sequence
1025 so no '-' is required, except if the character is itself a '-' */
1026 if (B64CHAR(ch) || ch == '-') {
1027 *out++ = '-';
1028 }
1029 inShift = 0;
1030 *out++ = (char) ch;
1031 } else {
1032 bitsleft += 16;
1033 charsleft = (charsleft << 16) | ch;
1034 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1035
1036 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001037 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 or '-' then the shift sequence will be terminated implicitly and we
1039 don't have to insert a '-'. */
1040
1041 if (bitsleft == 0) {
1042 if (i + 1 < size) {
1043 Py_UNICODE ch2 = s[i+1];
1044
1045 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001046
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001047 } else if (B64CHAR(ch2) || ch2 == '-') {
1048 *out++ = '-';
1049 inShift = 0;
1050 } else {
1051 inShift = 0;
1052 }
1053
1054 }
1055 else {
1056 *out++ = '-';
1057 inShift = 0;
1058 }
1059 }
Tim Petersced69f82003-09-16 20:30:58 +00001060 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001062 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 if (bitsleft) {
1064 *out++= B64(charsleft << (6-bitsleft) );
1065 *out++ = '-';
1066 }
1067
Tim Peters5de98422002-04-27 18:44:32 +00001068 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001069 return v;
1070}
1071
/* The UTF-7 helper macros are not needed past this point. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1078
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079/* --- UTF-8 Codec -------------------------------------------------------- */
1080
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details.
   The table is only read, so it is const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1102
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103PyObject *PyUnicode_DecodeUTF8(const char *s,
1104 int size,
1105 const char *errors)
1106{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001107 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001109 int startinpos;
1110 int endinpos;
1111 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 const char *e;
1113 PyUnicodeObject *unicode;
1114 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001115 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001116 PyObject *errorHandler = NULL;
1117 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118
1119 /* Note: size will always be longer than the resulting Unicode
1120 character count */
1121 unicode = _PyUnicode_New(size);
1122 if (!unicode)
1123 return NULL;
1124 if (size == 0)
1125 return (PyObject *)unicode;
1126
1127 /* Unpack UTF-8 encoded data */
1128 p = unicode->str;
1129 e = s + size;
1130
1131 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001132 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133
1134 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001135 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136 s++;
1137 continue;
1138 }
1139
1140 n = utf8_code_length[ch];
1141
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001142 if (s + n > e) {
1143 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001144 startinpos = s-starts;
1145 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001146 goto utf8Error;
1147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148
1149 switch (n) {
1150
1151 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001152 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001153 startinpos = s-starts;
1154 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001155 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156
1157 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001158 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001159 startinpos = s-starts;
1160 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001161 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162
1163 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001164 if ((s[1] & 0xc0) != 0x80) {
1165 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001166 startinpos = s-starts;
1167 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001168 goto utf8Error;
1169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001172 startinpos = s-starts;
1173 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001174 errmsg = "illegal encoding";
1175 goto utf8Error;
1176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 break;
1180
1181 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001182 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001183 (s[2] & 0xc0) != 0x80) {
1184 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001185 startinpos = s-starts;
1186 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001187 goto utf8Error;
1188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001190 if (ch < 0x0800) {
1191 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001192 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001193
1194 XXX For wide builds (UCS-4) we should probably try
1195 to recombine the surrogates into a single code
1196 unit.
1197 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001198 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001199 startinpos = s-starts;
1200 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001201 goto utf8Error;
1202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001204 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001205 break;
1206
1207 case 4:
1208 if ((s[1] & 0xc0) != 0x80 ||
1209 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001210 (s[3] & 0xc0) != 0x80) {
1211 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001212 startinpos = s-starts;
1213 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001214 goto utf8Error;
1215 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001216 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1217 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1218 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001219 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001220 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001221 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001222 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001223 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001225 startinpos = s-starts;
1226 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 goto utf8Error;
1228 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001229#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001230 *p++ = (Py_UNICODE)ch;
1231#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001232 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001233
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001234 /* translate from 10000..10FFFF to 0..FFFF */
1235 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001236
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001237 /* high surrogate = top 10 bits added to D800 */
1238 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001239
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001240 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001241 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001242#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 break;
1244
1245 default:
1246 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001247 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001248 startinpos = s-starts;
1249 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001250 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 }
1252 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001253 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001255 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001256 outpos = p-PyUnicode_AS_UNICODE(unicode);
1257 if (unicode_decode_call_errorhandler(
1258 errors, &errorHandler,
1259 "utf8", errmsg,
1260 starts, size, &startinpos, &endinpos, &exc, &s,
1261 (PyObject **)&unicode, &outpos, &p))
1262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 }
1264
1265 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001266 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267 goto onError;
1268
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001269 Py_XDECREF(errorHandler);
1270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271 return (PyObject *)unicode;
1272
1273onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274 Py_XDECREF(errorHandler);
1275 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 Py_DECREF(unicode);
1277 return NULL;
1278}
1279
Tim Peters602f7402002-04-27 18:03:26 +00001280/* Allocation strategy: if the string is short, convert into a stack buffer
1281 and allocate exactly as much space needed at the end. Else allocate the
1282 maximum possible needed (4 result bytes per Unicode character), and return
1283 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001284*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001285PyObject *
1286PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1287 int size,
1288 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289{
Tim Peters602f7402002-04-27 18:03:26 +00001290#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001291
Tim Peters602f7402002-04-27 18:03:26 +00001292 int i; /* index into s of next input byte */
1293 PyObject *v; /* result string object */
1294 char *p; /* next free byte in output buffer */
1295 int nallocated; /* number of result bytes allocated */
1296 int nneeded; /* number of result bytes needed */
1297 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001298
Tim Peters602f7402002-04-27 18:03:26 +00001299 assert(s != NULL);
1300 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301
Tim Peters602f7402002-04-27 18:03:26 +00001302 if (size <= MAX_SHORT_UNICHARS) {
1303 /* Write into the stack buffer; nallocated can't overflow.
1304 * At the end, we'll allocate exactly as much heap space as it
1305 * turns out we need.
1306 */
1307 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1308 v = NULL; /* will allocate after we're done */
1309 p = stackbuf;
1310 }
1311 else {
1312 /* Overallocate on the heap, and give the excess back at the end. */
1313 nallocated = size * 4;
1314 if (nallocated / 4 != size) /* overflow! */
1315 return PyErr_NoMemory();
1316 v = PyString_FromStringAndSize(NULL, nallocated);
1317 if (v == NULL)
1318 return NULL;
1319 p = PyString_AS_STRING(v);
1320 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001321
Tim Peters602f7402002-04-27 18:03:26 +00001322 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001323 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001324
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001325 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001326 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001328
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001330 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001331 *p++ = (char)(0xc0 | (ch >> 6));
1332 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001333 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001334 else {
Tim Peters602f7402002-04-27 18:03:26 +00001335 /* Encode UCS2 Unicode ordinals */
1336 if (ch < 0x10000) {
1337 /* Special case: check for high surrogate */
1338 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1339 Py_UCS4 ch2 = s[i];
1340 /* Check for low surrogate and combine the two to
1341 form a UCS4 value */
1342 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001343 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001344 i++;
1345 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 }
Tim Peters602f7402002-04-27 18:03:26 +00001347 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001349 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001350 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1351 *p++ = (char)(0x80 | (ch & 0x3f));
1352 continue;
1353 }
1354encodeUCS4:
1355 /* Encode UCS4 Unicode ordinals */
1356 *p++ = (char)(0xf0 | (ch >> 18));
1357 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1358 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1359 *p++ = (char)(0x80 | (ch & 0x3f));
1360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001362
Tim Peters602f7402002-04-27 18:03:26 +00001363 if (v == NULL) {
1364 /* This was stack allocated. */
1365 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1366 assert(nneeded <= nallocated);
1367 v = PyString_FromStringAndSize(stackbuf, nneeded);
1368 }
1369 else {
1370 /* Cut back to size actually needed. */
1371 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1372 assert(nneeded <= nallocated);
1373 _PyString_Resize(&v, nneeded);
1374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001376
Tim Peters602f7402002-04-27 18:03:26 +00001377#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378}
1379
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1381{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 if (!PyUnicode_Check(unicode)) {
1383 PyErr_BadArgument();
1384 return NULL;
1385 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001386 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1387 PyUnicode_GET_SIZE(unicode),
1388 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389}
1390
1391/* --- UTF-16 Codec ------------------------------------------------------- */
1392
Tim Peters772747b2001-08-09 22:21:55 +00001393PyObject *
1394PyUnicode_DecodeUTF16(const char *s,
1395 int size,
1396 const char *errors,
1397 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001399 const char *starts = s;
1400 int startinpos;
1401 int endinpos;
1402 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403 PyUnicodeObject *unicode;
1404 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001405 const unsigned char *q, *e;
1406 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001407 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001408 /* Offsets from q for retrieving byte pairs in the right order. */
1409#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1410 int ihi = 1, ilo = 0;
1411#else
1412 int ihi = 0, ilo = 1;
1413#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 PyObject *errorHandler = NULL;
1415 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416
1417 /* Note: size will always be longer than the resulting Unicode
1418 character count */
1419 unicode = _PyUnicode_New(size);
1420 if (!unicode)
1421 return NULL;
1422 if (size == 0)
1423 return (PyObject *)unicode;
1424
1425 /* Unpack UTF-16 encoded data */
1426 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001427 q = (unsigned char *)s;
1428 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429
1430 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001431 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001433 /* Check for BOM marks (U+FEFF) in the input and adjust current
1434 byte order setting accordingly. In native mode, the leading BOM
1435 mark is skipped, in all other modes, it is copied to the output
1436 stream as-is (giving a ZWNBSP character). */
1437 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001438 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001439#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001440 if (bom == 0xFEFF) {
1441 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001442 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001443 }
1444 else if (bom == 0xFFFE) {
1445 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001446 bo = 1;
1447 }
Tim Petersced69f82003-09-16 20:30:58 +00001448#else
Tim Peters772747b2001-08-09 22:21:55 +00001449 if (bom == 0xFEFF) {
1450 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001451 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001452 }
1453 else if (bom == 0xFFFE) {
1454 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001455 bo = -1;
1456 }
1457#endif
1458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459
Tim Peters772747b2001-08-09 22:21:55 +00001460 if (bo == -1) {
1461 /* force LE */
1462 ihi = 1;
1463 ilo = 0;
1464 }
1465 else if (bo == 1) {
1466 /* force BE */
1467 ihi = 0;
1468 ilo = 1;
1469 }
1470
1471 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 Py_UNICODE ch;
1473 /* remaing bytes at the end? (size should be even) */
1474 if (e-q<2) {
1475 errmsg = "truncated data";
1476 startinpos = ((const char *)q)-starts;
1477 endinpos = ((const char *)e)-starts;
1478 goto utf16Error;
1479 /* The remaining input chars are ignored if the callback
1480 chooses to skip the input */
1481 }
1482 ch = (q[ihi] << 8) | q[ilo];
1483
Tim Peters772747b2001-08-09 22:21:55 +00001484 q += 2;
1485
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 if (ch < 0xD800 || ch > 0xDFFF) {
1487 *p++ = ch;
1488 continue;
1489 }
1490
1491 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001492 if (q >= e) {
1493 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 startinpos = (((const char *)q)-2)-starts;
1495 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001496 goto utf16Error;
1497 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001498 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001499 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1500 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001501 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001502#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001503 *p++ = ch;
1504 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001505#else
1506 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001507#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001508 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001509 }
1510 else {
1511 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001512 startinpos = (((const char *)q)-4)-starts;
1513 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001514 goto utf16Error;
1515 }
1516
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001518 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001519 startinpos = (((const char *)q)-2)-starts;
1520 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001521 /* Fall through to report the error */
1522
1523 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 outpos = p-PyUnicode_AS_UNICODE(unicode);
1525 if (unicode_decode_call_errorhandler(
1526 errors, &errorHandler,
1527 "utf16", errmsg,
1528 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1529 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001530 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001531 }
1532
1533 if (byteorder)
1534 *byteorder = bo;
1535
1536 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001537 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001538 goto onError;
1539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 Py_XDECREF(errorHandler);
1541 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 return (PyObject *)unicode;
1543
1544onError:
1545 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001546 Py_XDECREF(errorHandler);
1547 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548 return NULL;
1549}
1550
Tim Peters772747b2001-08-09 22:21:55 +00001551PyObject *
1552PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1553 int size,
1554 const char *errors,
1555 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556{
1557 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001558 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001559#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001560 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001561#else
1562 const int pairs = 0;
1563#endif
Tim Peters772747b2001-08-09 22:21:55 +00001564 /* Offsets from p for storing byte pairs in the right order. */
1565#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1566 int ihi = 1, ilo = 0;
1567#else
1568 int ihi = 0, ilo = 1;
1569#endif
1570
1571#define STORECHAR(CH) \
1572 do { \
1573 p[ihi] = ((CH) >> 8) & 0xff; \
1574 p[ilo] = (CH) & 0xff; \
1575 p += 2; \
1576 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001577
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001578#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001579 for (i = pairs = 0; i < size; i++)
1580 if (s[i] >= 0x10000)
1581 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001582#endif
Tim Petersced69f82003-09-16 20:30:58 +00001583 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001584 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585 if (v == NULL)
1586 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
Tim Peters772747b2001-08-09 22:21:55 +00001588 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001590 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001591 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001592 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001593
1594 if (byteorder == -1) {
1595 /* force LE */
1596 ihi = 1;
1597 ilo = 0;
1598 }
1599 else if (byteorder == 1) {
1600 /* force BE */
1601 ihi = 0;
1602 ilo = 1;
1603 }
1604
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001605 while (size-- > 0) {
1606 Py_UNICODE ch = *s++;
1607 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001608#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001609 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001610 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1611 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001613#endif
Tim Peters772747b2001-08-09 22:21:55 +00001614 STORECHAR(ch);
1615 if (ch2)
1616 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001617 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001619#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620}
1621
1622PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1623{
1624 if (!PyUnicode_Check(unicode)) {
1625 PyErr_BadArgument();
1626 return NULL;
1627 }
1628 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1629 PyUnicode_GET_SIZE(unicode),
1630 NULL,
1631 0);
1632}
1633
1634/* --- Unicode Escape Codec ----------------------------------------------- */
1635
Fredrik Lundh06d12682001-01-24 07:59:11 +00001636static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001637
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1639 int size,
1640 const char *errors)
1641{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 const char *starts = s;
1643 int startinpos;
1644 int endinpos;
1645 int outpos;
1646 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001648 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001650 char* message;
1651 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652 PyObject *errorHandler = NULL;
1653 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001654
Guido van Rossumd57fd912000-03-10 22:53:23 +00001655 /* Escaped strings will always be longer than the resulting
1656 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001657 length after conversion to the true value.
1658 (but if the error callback returns a long replacement string
1659 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660 v = _PyUnicode_New(size);
1661 if (v == NULL)
1662 goto onError;
1663 if (size == 0)
1664 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001666 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 while (s < end) {
1670 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001671 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001672 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673
1674 /* Non-escape characters are interpreted as Unicode ordinals */
1675 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001676 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001677 continue;
1678 }
1679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001680 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 /* \ - Escapes */
1682 s++;
1683 switch (*s++) {
1684
1685 /* \x escapes */
1686 case '\n': break;
1687 case '\\': *p++ = '\\'; break;
1688 case '\'': *p++ = '\''; break;
1689 case '\"': *p++ = '\"'; break;
1690 case 'b': *p++ = '\b'; break;
1691 case 'f': *p++ = '\014'; break; /* FF */
1692 case 't': *p++ = '\t'; break;
1693 case 'n': *p++ = '\n'; break;
1694 case 'r': *p++ = '\r'; break;
1695 case 'v': *p++ = '\013'; break; /* VT */
1696 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1697
1698 /* \OOO (octal) escapes */
1699 case '0': case '1': case '2': case '3':
1700 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001701 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001703 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001705 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001706 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001707 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708 break;
1709
Fredrik Lundhccc74732001-02-18 22:13:49 +00001710 /* hex escapes */
1711 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001713 digits = 2;
1714 message = "truncated \\xXX escape";
1715 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716
Fredrik Lundhccc74732001-02-18 22:13:49 +00001717 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001719 digits = 4;
1720 message = "truncated \\uXXXX escape";
1721 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722
Fredrik Lundhccc74732001-02-18 22:13:49 +00001723 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001724 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001725 digits = 8;
1726 message = "truncated \\UXXXXXXXX escape";
1727 hexescape:
1728 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001729 outpos = p-PyUnicode_AS_UNICODE(v);
1730 if (s+digits>end) {
1731 endinpos = size;
1732 if (unicode_decode_call_errorhandler(
1733 errors, &errorHandler,
1734 "unicodeescape", "end of string in escape sequence",
1735 starts, size, &startinpos, &endinpos, &exc, &s,
1736 (PyObject **)&v, &outpos, &p))
1737 goto onError;
1738 goto nextByte;
1739 }
1740 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001741 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001742 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001743 endinpos = (s+i+1)-starts;
1744 if (unicode_decode_call_errorhandler(
1745 errors, &errorHandler,
1746 "unicodeescape", message,
1747 starts, size, &startinpos, &endinpos, &exc, &s,
1748 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001749 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001750 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001751 }
1752 chr = (chr<<4) & ~0xF;
1753 if (c >= '0' && c <= '9')
1754 chr += c - '0';
1755 else if (c >= 'a' && c <= 'f')
1756 chr += 10 + c - 'a';
1757 else
1758 chr += 10 + c - 'A';
1759 }
1760 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001761 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001762 /* _decoding_error will have already written into the
1763 target buffer. */
1764 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001765 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001766 /* when we get here, chr is a 32-bit unicode character */
1767 if (chr <= 0xffff)
1768 /* UCS-2 character */
1769 *p++ = (Py_UNICODE) chr;
1770 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001771 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001772 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001773#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001774 *p++ = chr;
1775#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001776 chr -= 0x10000L;
1777 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001778 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001779#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001780 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 endinpos = s-starts;
1782 outpos = p-PyUnicode_AS_UNICODE(v);
1783 if (unicode_decode_call_errorhandler(
1784 errors, &errorHandler,
1785 "unicodeescape", "illegal Unicode character",
1786 starts, size, &startinpos, &endinpos, &exc, &s,
1787 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001788 goto onError;
1789 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001790 break;
1791
1792 /* \N{name} */
1793 case 'N':
1794 message = "malformed \\N character escape";
1795 if (ucnhash_CAPI == NULL) {
1796 /* load the unicode data module */
1797 PyObject *m, *v;
1798 m = PyImport_ImportModule("unicodedata");
1799 if (m == NULL)
1800 goto ucnhashError;
1801 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1802 Py_DECREF(m);
1803 if (v == NULL)
1804 goto ucnhashError;
1805 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1806 Py_DECREF(v);
1807 if (ucnhash_CAPI == NULL)
1808 goto ucnhashError;
1809 }
1810 if (*s == '{') {
1811 const char *start = s+1;
1812 /* look for the closing brace */
1813 while (*s != '}' && s < end)
1814 s++;
1815 if (s > start && s < end && *s == '}') {
1816 /* found a name. look it up in the unicode database */
1817 message = "unknown Unicode character name";
1818 s++;
1819 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1820 goto store;
1821 }
1822 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001823 endinpos = s-starts;
1824 outpos = p-PyUnicode_AS_UNICODE(v);
1825 if (unicode_decode_call_errorhandler(
1826 errors, &errorHandler,
1827 "unicodeescape", message,
1828 starts, size, &startinpos, &endinpos, &exc, &s,
1829 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001830 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001831 break;
1832
1833 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001834 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 message = "\\ at end of string";
1836 s--;
1837 endinpos = s-starts;
1838 outpos = p-PyUnicode_AS_UNICODE(v);
1839 if (unicode_decode_call_errorhandler(
1840 errors, &errorHandler,
1841 "unicodeescape", message,
1842 starts, size, &startinpos, &endinpos, &exc, &s,
1843 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001844 goto onError;
1845 }
1846 else {
1847 *p++ = '\\';
1848 *p++ = (unsigned char)s[-1];
1849 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001852 nextByte:
1853 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001855 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001856 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001857 Py_XDECREF(errorHandler);
1858 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001860
Fredrik Lundhccc74732001-02-18 22:13:49 +00001861ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001862 PyErr_SetString(
1863 PyExc_UnicodeError,
1864 "\\N escapes not supported (can't load unicodedata module)"
1865 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001866 Py_XDECREF(errorHandler);
1867 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001868 return NULL;
1869
Fredrik Lundhccc74732001-02-18 22:13:49 +00001870onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001871 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001872 Py_XDECREF(errorHandler);
1873 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874 return NULL;
1875}
1876
1877/* Return a Unicode-Escape string version of the Unicode object.
1878
1879 If quotes is true, the string is enclosed in u"" or u'' quotes as
1880 appropriate.
1881
1882*/
1883
Barry Warsaw51ac5802000-03-20 16:36:48 +00001884static const Py_UNICODE *findchar(const Py_UNICODE *s,
1885 int size,
1886 Py_UNICODE ch);
1887
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888static
1889PyObject *unicodeescape_string(const Py_UNICODE *s,
1890 int size,
1891 int quotes)
1892{
1893 PyObject *repr;
1894 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001896 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897
1898 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1899 if (repr == NULL)
1900 return NULL;
1901
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001902 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903
1904 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001906 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907 !findchar(s, size, '"')) ? '"' : '\'';
1908 }
1909 while (size-- > 0) {
1910 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001911
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001913 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001914 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 *p++ = '\\';
1916 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001917 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001918 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001919
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001920#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001921 /* Map 21-bit characters to '\U00xxxxxx' */
1922 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001923 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001924
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001925 /* Resize the string if necessary */
1926 if (offset + 12 > PyString_GET_SIZE(repr)) {
1927 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001928 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001929 p = PyString_AS_STRING(repr) + offset;
1930 }
1931
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001932 *p++ = '\\';
1933 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001934 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1935 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1936 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1937 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1938 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1939 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1940 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001941 *p++ = hexdigit[ch & 0x0000000F];
1942 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001943 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001944#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001945 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1946 else if (ch >= 0xD800 && ch < 0xDC00) {
1947 Py_UNICODE ch2;
1948 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00001949
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001950 ch2 = *s++;
1951 size--;
1952 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1953 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1954 *p++ = '\\';
1955 *p++ = 'U';
1956 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1957 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1958 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1959 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1960 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1961 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1962 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1963 *p++ = hexdigit[ucs & 0x0000000F];
1964 continue;
1965 }
1966 /* Fall through: isolated surrogates are copied as-is */
1967 s--;
1968 size++;
1969 }
1970
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001972 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 *p++ = '\\';
1974 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001975 *p++ = hexdigit[(ch >> 12) & 0x000F];
1976 *p++ = hexdigit[(ch >> 8) & 0x000F];
1977 *p++ = hexdigit[(ch >> 4) & 0x000F];
1978 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001980
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001981 /* Map special whitespace to '\t', \n', '\r' */
1982 else if (ch == '\t') {
1983 *p++ = '\\';
1984 *p++ = 't';
1985 }
1986 else if (ch == '\n') {
1987 *p++ = '\\';
1988 *p++ = 'n';
1989 }
1990 else if (ch == '\r') {
1991 *p++ = '\\';
1992 *p++ = 'r';
1993 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001994
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001995 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001996 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001998 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001999 *p++ = hexdigit[(ch >> 4) & 0x000F];
2000 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002001 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002002
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003 /* Copy everything else as-is */
2004 else
2005 *p++ = (char) ch;
2006 }
2007 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002008 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009
2010 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002011 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002012 return repr;
2013}
2014
2015PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2016 int size)
2017{
2018 return unicodeescape_string(s, size, 0);
2019}
2020
2021PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2022{
2023 if (!PyUnicode_Check(unicode)) {
2024 PyErr_BadArgument();
2025 return NULL;
2026 }
2027 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2028 PyUnicode_GET_SIZE(unicode));
2029}
2030
2031/* --- Raw Unicode Escape Codec ------------------------------------------- */
2032
2033PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2034 int size,
2035 const char *errors)
2036{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002037 const char *starts = s;
2038 int startinpos;
2039 int endinpos;
2040 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002042 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 const char *end;
2044 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002045 PyObject *errorHandler = NULL;
2046 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002047
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 /* Escaped strings will always be longer than the resulting
2049 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002050 length after conversion to the true value. (But decoding error
2051 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052 v = _PyUnicode_New(size);
2053 if (v == NULL)
2054 goto onError;
2055 if (size == 0)
2056 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002057 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 end = s + size;
2059 while (s < end) {
2060 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002061 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002063 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064
2065 /* Non-escape characters are interpreted as Unicode ordinals */
2066 if (*s != '\\') {
2067 *p++ = (unsigned char)*s++;
2068 continue;
2069 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071
2072 /* \u-escapes are only interpreted iff the number of leading
2073 backslashes if odd */
2074 bs = s;
2075 for (;s < end;) {
2076 if (*s != '\\')
2077 break;
2078 *p++ = (unsigned char)*s++;
2079 }
2080 if (((s - bs) & 1) == 0 ||
2081 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002082 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083 continue;
2084 }
2085 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002086 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 s++;
2088
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002089 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002090 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002091 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002092 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002094 endinpos = s-starts;
2095 if (unicode_decode_call_errorhandler(
2096 errors, &errorHandler,
2097 "rawunicodeescape", "truncated \\uXXXX",
2098 starts, size, &startinpos, &endinpos, &exc, &s,
2099 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002101 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102 }
2103 x = (x<<4) & ~0xF;
2104 if (c >= '0' && c <= '9')
2105 x += c - '0';
2106 else if (c >= 'a' && c <= 'f')
2107 x += 10 + c - 'a';
2108 else
2109 x += 10 + c - 'A';
2110 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002111#ifndef Py_UNICODE_WIDE
2112 if (x > 0x10000) {
2113 if (unicode_decode_call_errorhandler(
2114 errors, &errorHandler,
2115 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2116 starts, size, &startinpos, &endinpos, &exc, &s,
2117 (PyObject **)&v, &outpos, &p))
2118 goto onError;
2119 }
2120#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002121 *p++ = x;
2122 nextByte:
2123 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002125 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002126 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 Py_XDECREF(errorHandler);
2128 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002130
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 onError:
2132 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 Py_XDECREF(errorHandler);
2134 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 return NULL;
2136}
2137
2138PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2139 int size)
2140{
2141 PyObject *repr;
2142 char *p;
2143 char *q;
2144
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002145 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002147#ifdef Py_UNICODE_WIDE
2148 repr = PyString_FromStringAndSize(NULL, 10 * size);
2149#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002151#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152 if (repr == NULL)
2153 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002154 if (size == 0)
2155 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156
2157 p = q = PyString_AS_STRING(repr);
2158 while (size-- > 0) {
2159 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002160#ifdef Py_UNICODE_WIDE
2161 /* Map 32-bit characters to '\Uxxxxxxxx' */
2162 if (ch >= 0x10000) {
2163 *p++ = '\\';
2164 *p++ = 'U';
2165 *p++ = hexdigit[(ch >> 28) & 0xf];
2166 *p++ = hexdigit[(ch >> 24) & 0xf];
2167 *p++ = hexdigit[(ch >> 20) & 0xf];
2168 *p++ = hexdigit[(ch >> 16) & 0xf];
2169 *p++ = hexdigit[(ch >> 12) & 0xf];
2170 *p++ = hexdigit[(ch >> 8) & 0xf];
2171 *p++ = hexdigit[(ch >> 4) & 0xf];
2172 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002173 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002174 else
2175#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 /* Map 16-bit characters to '\uxxxx' */
2177 if (ch >= 256) {
2178 *p++ = '\\';
2179 *p++ = 'u';
2180 *p++ = hexdigit[(ch >> 12) & 0xf];
2181 *p++ = hexdigit[(ch >> 8) & 0xf];
2182 *p++ = hexdigit[(ch >> 4) & 0xf];
2183 *p++ = hexdigit[ch & 15];
2184 }
2185 /* Copy everything else as-is */
2186 else
2187 *p++ = (char) ch;
2188 }
2189 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002190 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191 return repr;
2192}
2193
2194PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2195{
2196 if (!PyUnicode_Check(unicode)) {
2197 PyErr_BadArgument();
2198 return NULL;
2199 }
2200 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2201 PyUnicode_GET_SIZE(unicode));
2202}
2203
2204/* --- Latin-1 Codec ------------------------------------------------------ */
2205
2206PyObject *PyUnicode_DecodeLatin1(const char *s,
2207 int size,
2208 const char *errors)
2209{
2210 PyUnicodeObject *v;
2211 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002212
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002214 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002215 Py_UNICODE r = *(unsigned char*)s;
2216 return PyUnicode_FromUnicode(&r, 1);
2217 }
2218
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219 v = _PyUnicode_New(size);
2220 if (v == NULL)
2221 goto onError;
2222 if (size == 0)
2223 return (PyObject *)v;
2224 p = PyUnicode_AS_UNICODE(v);
2225 while (size-- > 0)
2226 *p++ = (unsigned char)*s++;
2227 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002228
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229 onError:
2230 Py_XDECREF(v);
2231 return NULL;
2232}
2233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002234/* create or adjust a UnicodeEncodeError */
2235static void make_encode_exception(PyObject **exceptionObject,
2236 const char *encoding,
2237 const Py_UNICODE *unicode, int size,
2238 int startpos, int endpos,
2239 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002240{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002241 if (*exceptionObject == NULL) {
2242 *exceptionObject = PyUnicodeEncodeError_Create(
2243 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 }
2245 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002246 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2247 goto onError;
2248 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2249 goto onError;
2250 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2251 goto onError;
2252 return;
2253 onError:
2254 Py_DECREF(*exceptionObject);
2255 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256 }
2257}
2258
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259/* raises a UnicodeEncodeError */
2260static void raise_encode_exception(PyObject **exceptionObject,
2261 const char *encoding,
2262 const Py_UNICODE *unicode, int size,
2263 int startpos, int endpos,
2264 const char *reason)
2265{
2266 make_encode_exception(exceptionObject,
2267 encoding, unicode, size, startpos, endpos, reason);
2268 if (*exceptionObject != NULL)
2269 PyCodec_StrictErrors(*exceptionObject);
2270}
2271
2272/* error handling callback helper:
2273 build arguments, call the callback and check the arguments,
2274 put the result into newpos and return the replacement string, which
2275 has to be freed by the caller */
2276static PyObject *unicode_encode_call_errorhandler(const char *errors,
2277 PyObject **errorHandler,
2278 const char *encoding, const char *reason,
2279 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2280 int startpos, int endpos,
2281 int *newpos)
2282{
2283 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2284
2285 PyObject *restuple;
2286 PyObject *resunicode;
2287
2288 if (*errorHandler == NULL) {
2289 *errorHandler = PyCodec_LookupError(errors);
2290 if (*errorHandler == NULL)
2291 return NULL;
2292 }
2293
2294 make_encode_exception(exceptionObject,
2295 encoding, unicode, size, startpos, endpos, reason);
2296 if (*exceptionObject == NULL)
2297 return NULL;
2298
2299 restuple = PyObject_CallFunctionObjArgs(
2300 *errorHandler, *exceptionObject, NULL);
2301 if (restuple == NULL)
2302 return NULL;
2303 if (!PyTuple_Check(restuple)) {
2304 PyErr_Format(PyExc_TypeError, &argparse[4]);
2305 Py_DECREF(restuple);
2306 return NULL;
2307 }
2308 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2309 &resunicode, newpos)) {
2310 Py_DECREF(restuple);
2311 return NULL;
2312 }
2313 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002314 *newpos = size+*newpos;
2315 if (*newpos<0 || *newpos>size) {
2316 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2317 Py_DECREF(restuple);
2318 return NULL;
2319 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002320 Py_INCREF(resunicode);
2321 Py_DECREF(restuple);
2322 return resunicode;
2323}
2324
2325static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2326 int size,
2327 const char *errors,
2328 int limit)
2329{
2330 /* output object */
2331 PyObject *res;
2332 /* pointers to the beginning and end+1 of input */
2333 const Py_UNICODE *startp = p;
2334 const Py_UNICODE *endp = p + size;
2335 /* pointer to the beginning of the unencodable characters */
2336 /* const Py_UNICODE *badp = NULL; */
2337 /* pointer into the output */
2338 char *str;
2339 /* current output position */
2340 int respos = 0;
2341 int ressize;
2342 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2343 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2344 PyObject *errorHandler = NULL;
2345 PyObject *exc = NULL;
2346 /* the following variable is used for caching string comparisons
2347 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2348 int known_errorHandler = -1;
2349
2350 /* allocate enough for a simple encoding without
2351 replacements, if we need more, we'll resize */
2352 res = PyString_FromStringAndSize(NULL, size);
2353 if (res == NULL)
2354 goto onError;
2355 if (size == 0)
2356 return res;
2357 str = PyString_AS_STRING(res);
2358 ressize = size;
2359
2360 while (p<endp) {
2361 Py_UNICODE c = *p;
2362
2363 /* can we encode this? */
2364 if (c<limit) {
2365 /* no overflow check, because we know that the space is enough */
2366 *str++ = (char)c;
2367 ++p;
2368 }
2369 else {
2370 int unicodepos = p-startp;
2371 int requiredsize;
2372 PyObject *repunicode;
2373 int repsize;
2374 int newpos;
2375 int respos;
2376 Py_UNICODE *uni2;
2377 /* startpos for collecting unencodable chars */
2378 const Py_UNICODE *collstart = p;
2379 const Py_UNICODE *collend = p;
2380 /* find all unecodable characters */
2381 while ((collend < endp) && ((*collend)>=limit))
2382 ++collend;
2383 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2384 if (known_errorHandler==-1) {
2385 if ((errors==NULL) || (!strcmp(errors, "strict")))
2386 known_errorHandler = 1;
2387 else if (!strcmp(errors, "replace"))
2388 known_errorHandler = 2;
2389 else if (!strcmp(errors, "ignore"))
2390 known_errorHandler = 3;
2391 else if (!strcmp(errors, "xmlcharrefreplace"))
2392 known_errorHandler = 4;
2393 else
2394 known_errorHandler = 0;
2395 }
2396 switch (known_errorHandler) {
2397 case 1: /* strict */
2398 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2399 goto onError;
2400 case 2: /* replace */
2401 while (collstart++<collend)
2402 *str++ = '?'; /* fall through */
2403 case 3: /* ignore */
2404 p = collend;
2405 break;
2406 case 4: /* xmlcharrefreplace */
2407 respos = str-PyString_AS_STRING(res);
2408 /* determine replacement size (temporarily (mis)uses p) */
2409 for (p = collstart, repsize = 0; p < collend; ++p) {
2410 if (*p<10)
2411 repsize += 2+1+1;
2412 else if (*p<100)
2413 repsize += 2+2+1;
2414 else if (*p<1000)
2415 repsize += 2+3+1;
2416 else if (*p<10000)
2417 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002418#ifndef Py_UNICODE_WIDE
2419 else
2420 repsize += 2+5+1;
2421#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002422 else if (*p<100000)
2423 repsize += 2+5+1;
2424 else if (*p<1000000)
2425 repsize += 2+6+1;
2426 else
2427 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002428#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002429 }
2430 requiredsize = respos+repsize+(endp-collend);
2431 if (requiredsize > ressize) {
2432 if (requiredsize<2*ressize)
2433 requiredsize = 2*ressize;
2434 if (_PyString_Resize(&res, requiredsize))
2435 goto onError;
2436 str = PyString_AS_STRING(res) + respos;
2437 ressize = requiredsize;
2438 }
2439 /* generate replacement (temporarily (mis)uses p) */
2440 for (p = collstart; p < collend; ++p) {
2441 str += sprintf(str, "&#%d;", (int)*p);
2442 }
2443 p = collend;
2444 break;
2445 default:
2446 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2447 encoding, reason, startp, size, &exc,
2448 collstart-startp, collend-startp, &newpos);
2449 if (repunicode == NULL)
2450 goto onError;
2451 /* need more space? (at least enough for what we
2452 have+the replacement+the rest of the string, so
2453 we won't have to check space for encodable characters) */
2454 respos = str-PyString_AS_STRING(res);
2455 repsize = PyUnicode_GET_SIZE(repunicode);
2456 requiredsize = respos+repsize+(endp-collend);
2457 if (requiredsize > ressize) {
2458 if (requiredsize<2*ressize)
2459 requiredsize = 2*ressize;
2460 if (_PyString_Resize(&res, requiredsize)) {
2461 Py_DECREF(repunicode);
2462 goto onError;
2463 }
2464 str = PyString_AS_STRING(res) + respos;
2465 ressize = requiredsize;
2466 }
2467 /* check if there is anything unencodable in the replacement
2468 and copy it to the output */
2469 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2470 c = *uni2;
2471 if (c >= limit) {
2472 raise_encode_exception(&exc, encoding, startp, size,
2473 unicodepos, unicodepos+1, reason);
2474 Py_DECREF(repunicode);
2475 goto onError;
2476 }
2477 *str = (char)c;
2478 }
2479 p = startp + newpos;
2480 Py_DECREF(repunicode);
2481 }
2482 }
2483 }
2484 /* Resize if we allocated to much */
2485 respos = str-PyString_AS_STRING(res);
2486 if (respos<ressize)
2487 /* If this falls res will be NULL */
2488 _PyString_Resize(&res, respos);
2489 Py_XDECREF(errorHandler);
2490 Py_XDECREF(exc);
2491 return res;
2492
2493 onError:
2494 Py_XDECREF(res);
2495 Py_XDECREF(errorHandler);
2496 Py_XDECREF(exc);
2497 return NULL;
2498}
2499
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2501 int size,
2502 const char *errors)
2503{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002504 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505}
2506
2507PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2508{
2509 if (!PyUnicode_Check(unicode)) {
2510 PyErr_BadArgument();
2511 return NULL;
2512 }
2513 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2514 PyUnicode_GET_SIZE(unicode),
2515 NULL);
2516}
2517
2518/* --- 7-bit ASCII Codec -------------------------------------------------- */
2519
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520PyObject *PyUnicode_DecodeASCII(const char *s,
2521 int size,
2522 const char *errors)
2523{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002524 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 PyUnicodeObject *v;
2526 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002527 int startinpos;
2528 int endinpos;
2529 int outpos;
2530 const char *e;
2531 PyObject *errorHandler = NULL;
2532 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002533
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002535 if (size == 1 && *(unsigned char*)s < 128) {
2536 Py_UNICODE r = *(unsigned char*)s;
2537 return PyUnicode_FromUnicode(&r, 1);
2538 }
Tim Petersced69f82003-09-16 20:30:58 +00002539
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 v = _PyUnicode_New(size);
2541 if (v == NULL)
2542 goto onError;
2543 if (size == 0)
2544 return (PyObject *)v;
2545 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002546 e = s + size;
2547 while (s < e) {
2548 register unsigned char c = (unsigned char)*s;
2549 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002551 ++s;
2552 }
2553 else {
2554 startinpos = s-starts;
2555 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002556 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002557 if (unicode_decode_call_errorhandler(
2558 errors, &errorHandler,
2559 "ascii", "ordinal not in range(128)",
2560 starts, size, &startinpos, &endinpos, &exc, &s,
2561 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002565 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002566 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002567 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002568 Py_XDECREF(errorHandler);
2569 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002571
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 onError:
2573 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002574 Py_XDECREF(errorHandler);
2575 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 return NULL;
2577}
2578
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2580 int size,
2581 const char *errors)
2582{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584}
2585
2586PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2587{
2588 if (!PyUnicode_Check(unicode)) {
2589 PyErr_BadArgument();
2590 return NULL;
2591 }
2592 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2593 PyUnicode_GET_SIZE(unicode),
2594 NULL);
2595}
2596
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002597#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002598
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002599/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002600
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002601PyObject *PyUnicode_DecodeMBCS(const char *s,
2602 int size,
2603 const char *errors)
2604{
2605 PyUnicodeObject *v;
2606 Py_UNICODE *p;
2607
2608 /* First get the size of the result */
2609 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002610 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002611 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2612
2613 v = _PyUnicode_New(usize);
2614 if (v == NULL)
2615 return NULL;
2616 if (usize == 0)
2617 return (PyObject *)v;
2618 p = PyUnicode_AS_UNICODE(v);
2619 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2620 Py_DECREF(v);
2621 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2622 }
2623
2624 return (PyObject *)v;
2625}
2626
2627PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2628 int size,
2629 const char *errors)
2630{
2631 PyObject *repr;
2632 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002633 DWORD mbcssize;
2634
2635 /* If there are no characters, bail now! */
2636 if (size==0)
2637 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002638
2639 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002640 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002641 if (mbcssize==0)
2642 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2643
2644 repr = PyString_FromStringAndSize(NULL, mbcssize);
2645 if (repr == NULL)
2646 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002647 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002648 return repr;
2649
2650 /* Do the conversion */
2651 s = PyString_AS_STRING(repr);
2652 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2653 Py_DECREF(repr);
2654 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2655 }
2656 return repr;
2657}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002658
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002659PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2660{
2661 if (!PyUnicode_Check(unicode)) {
2662 PyErr_BadArgument();
2663 return NULL;
2664 }
2665 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2666 PyUnicode_GET_SIZE(unicode),
2667 NULL);
2668}
2669
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002670#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002671
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672/* --- Character Mapping Codec -------------------------------------------- */
2673
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674PyObject *PyUnicode_DecodeCharmap(const char *s,
2675 int size,
2676 PyObject *mapping,
2677 const char *errors)
2678{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 const char *starts = s;
2680 int startinpos;
2681 int endinpos;
2682 int outpos;
2683 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 PyUnicodeObject *v;
2685 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002686 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002687 PyObject *errorHandler = NULL;
2688 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002689
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 /* Default to Latin-1 */
2691 if (mapping == NULL)
2692 return PyUnicode_DecodeLatin1(s, size, errors);
2693
2694 v = _PyUnicode_New(size);
2695 if (v == NULL)
2696 goto onError;
2697 if (size == 0)
2698 return (PyObject *)v;
2699 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002700 e = s + size;
2701 while (s < e) {
2702 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703 PyObject *w, *x;
2704
2705 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2706 w = PyInt_FromLong((long)ch);
2707 if (w == NULL)
2708 goto onError;
2709 x = PyObject_GetItem(mapping, w);
2710 Py_DECREF(w);
2711 if (x == NULL) {
2712 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002713 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002715 x = Py_None;
2716 Py_INCREF(x);
2717 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002718 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719 }
2720
2721 /* Apply mapping */
2722 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002723 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724 if (value < 0 || value > 65535) {
2725 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002726 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 Py_DECREF(x);
2728 goto onError;
2729 }
2730 *p++ = (Py_UNICODE)value;
2731 }
2732 else if (x == Py_None) {
2733 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 outpos = p-PyUnicode_AS_UNICODE(v);
2735 startinpos = s-starts;
2736 endinpos = startinpos+1;
2737 if (unicode_decode_call_errorhandler(
2738 errors, &errorHandler,
2739 "charmap", "character maps to <undefined>",
2740 starts, size, &startinpos, &endinpos, &exc, &s,
2741 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 Py_DECREF(x);
2743 goto onError;
2744 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002745 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 }
2747 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002748 int targetsize = PyUnicode_GET_SIZE(x);
2749
2750 if (targetsize == 1)
2751 /* 1-1 mapping */
2752 *p++ = *PyUnicode_AS_UNICODE(x);
2753
2754 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002756 if (targetsize > extrachars) {
2757 /* resize first */
2758 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2759 int needed = (targetsize - extrachars) + \
2760 (targetsize << 2);
2761 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002762 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002763 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002764 Py_DECREF(x);
2765 goto onError;
2766 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002767 p = PyUnicode_AS_UNICODE(v) + oldpos;
2768 }
2769 Py_UNICODE_COPY(p,
2770 PyUnicode_AS_UNICODE(x),
2771 targetsize);
2772 p += targetsize;
2773 extrachars -= targetsize;
2774 }
2775 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 }
2777 else {
2778 /* wrong return value */
2779 PyErr_SetString(PyExc_TypeError,
2780 "character mapping must return integer, None or unicode");
2781 Py_DECREF(x);
2782 goto onError;
2783 }
2784 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 }
2787 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002788 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 Py_XDECREF(errorHandler);
2791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002793
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002795 Py_XDECREF(errorHandler);
2796 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 Py_XDECREF(v);
2798 return NULL;
2799}
2800
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002801/* Lookup the character ch in the mapping. If the character
2802 can't be found, Py_None is returned (or NULL, if another
2803 error occured). */
2804static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002806 PyObject *w = PyInt_FromLong((long)c);
2807 PyObject *x;
2808
2809 if (w == NULL)
2810 return NULL;
2811 x = PyObject_GetItem(mapping, w);
2812 Py_DECREF(w);
2813 if (x == NULL) {
2814 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2815 /* No mapping found means: mapping is undefined. */
2816 PyErr_Clear();
2817 x = Py_None;
2818 Py_INCREF(x);
2819 return x;
2820 } else
2821 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002823 else if (x == Py_None)
2824 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002825 else if (PyInt_Check(x)) {
2826 long value = PyInt_AS_LONG(x);
2827 if (value < 0 || value > 255) {
2828 PyErr_SetString(PyExc_TypeError,
2829 "character mapping must be in range(256)");
2830 Py_DECREF(x);
2831 return NULL;
2832 }
2833 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 else if (PyString_Check(x))
2836 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 /* wrong return value */
2839 PyErr_SetString(PyExc_TypeError,
2840 "character mapping must return integer, None or str");
2841 Py_DECREF(x);
2842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 }
2844}
2845
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846/* lookup the character, put the result in the output string and adjust
2847 various state variables. Reallocate the output string if not enough
2848 space is available. Return a new reference to the object that
2849 was put in the output buffer, or Py_None, if the mapping was undefined
2850 (in which case no character was written) or NULL, if a
2851 reallocation error ocurred. The called must decref the result */
2852static
2853PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2854 PyObject **outobj, int *outpos)
2855{
2856 PyObject *rep = charmapencode_lookup(c, mapping);
2857
2858 if (rep==NULL)
2859 return NULL;
2860 else if (rep==Py_None)
2861 return rep;
2862 else {
2863 char *outstart = PyString_AS_STRING(*outobj);
2864 int outsize = PyString_GET_SIZE(*outobj);
2865 if (PyInt_Check(rep)) {
2866 int requiredsize = *outpos+1;
2867 if (outsize<requiredsize) {
2868 /* exponentially overallocate to minimize reallocations */
2869 if (requiredsize < 2*outsize)
2870 requiredsize = 2*outsize;
2871 if (_PyString_Resize(outobj, requiredsize)) {
2872 Py_DECREF(rep);
2873 return NULL;
2874 }
2875 outstart = PyString_AS_STRING(*outobj);
2876 }
2877 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2878 }
2879 else {
2880 const char *repchars = PyString_AS_STRING(rep);
2881 int repsize = PyString_GET_SIZE(rep);
2882 int requiredsize = *outpos+repsize;
2883 if (outsize<requiredsize) {
2884 /* exponentially overallocate to minimize reallocations */
2885 if (requiredsize < 2*outsize)
2886 requiredsize = 2*outsize;
2887 if (_PyString_Resize(outobj, requiredsize)) {
2888 Py_DECREF(rep);
2889 return NULL;
2890 }
2891 outstart = PyString_AS_STRING(*outobj);
2892 }
2893 memcpy(outstart + *outpos, repchars, repsize);
2894 *outpos += repsize;
2895 }
2896 }
2897 return rep;
2898}
2899
2900/* handle an error in PyUnicode_EncodeCharmap
2901 Return 0 on success, -1 on error */
2902static
2903int charmap_encoding_error(
2904 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2905 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002906 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002907 PyObject **res, int *respos)
2908{
2909 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2910 int repsize;
2911 int newpos;
2912 Py_UNICODE *uni2;
2913 /* startpos for collecting unencodable chars */
2914 int collstartpos = *inpos;
2915 int collendpos = *inpos+1;
2916 int collpos;
2917 char *encoding = "charmap";
2918 char *reason = "character maps to <undefined>";
2919
2920 PyObject *x;
2921 /* find all unencodable characters */
2922 while (collendpos < size) {
2923 x = charmapencode_lookup(p[collendpos], mapping);
2924 if (x==NULL)
2925 return -1;
2926 else if (x!=Py_None) {
2927 Py_DECREF(x);
2928 break;
2929 }
2930 Py_DECREF(x);
2931 ++collendpos;
2932 }
2933 /* cache callback name lookup
2934 * (if not done yet, i.e. it's the first error) */
2935 if (*known_errorHandler==-1) {
2936 if ((errors==NULL) || (!strcmp(errors, "strict")))
2937 *known_errorHandler = 1;
2938 else if (!strcmp(errors, "replace"))
2939 *known_errorHandler = 2;
2940 else if (!strcmp(errors, "ignore"))
2941 *known_errorHandler = 3;
2942 else if (!strcmp(errors, "xmlcharrefreplace"))
2943 *known_errorHandler = 4;
2944 else
2945 *known_errorHandler = 0;
2946 }
2947 switch (*known_errorHandler) {
2948 case 1: /* strict */
2949 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2950 return -1;
2951 case 2: /* replace */
2952 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2953 x = charmapencode_output('?', mapping, res, respos);
2954 if (x==NULL) {
2955 return -1;
2956 }
2957 else if (x==Py_None) {
2958 Py_DECREF(x);
2959 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2960 return -1;
2961 }
2962 Py_DECREF(x);
2963 }
2964 /* fall through */
2965 case 3: /* ignore */
2966 *inpos = collendpos;
2967 break;
2968 case 4: /* xmlcharrefreplace */
2969 /* generate replacement (temporarily (mis)uses p) */
2970 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2971 char buffer[2+29+1+1];
2972 char *cp;
2973 sprintf(buffer, "&#%d;", (int)p[collpos]);
2974 for (cp = buffer; *cp; ++cp) {
2975 x = charmapencode_output(*cp, mapping, res, respos);
2976 if (x==NULL)
2977 return -1;
2978 else if (x==Py_None) {
2979 Py_DECREF(x);
2980 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2981 return -1;
2982 }
2983 Py_DECREF(x);
2984 }
2985 }
2986 *inpos = collendpos;
2987 break;
2988 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002989 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002990 encoding, reason, p, size, exceptionObject,
2991 collstartpos, collendpos, &newpos);
2992 if (repunicode == NULL)
2993 return -1;
2994 /* generate replacement */
2995 repsize = PyUnicode_GET_SIZE(repunicode);
2996 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2997 x = charmapencode_output(*uni2, mapping, res, respos);
2998 if (x==NULL) {
2999 Py_DECREF(repunicode);
3000 return -1;
3001 }
3002 else if (x==Py_None) {
3003 Py_DECREF(repunicode);
3004 Py_DECREF(x);
3005 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3006 return -1;
3007 }
3008 Py_DECREF(x);
3009 }
3010 *inpos = newpos;
3011 Py_DECREF(repunicode);
3012 }
3013 return 0;
3014}
3015
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3017 int size,
3018 PyObject *mapping,
3019 const char *errors)
3020{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003021 /* output object */
3022 PyObject *res = NULL;
3023 /* current input position */
3024 int inpos = 0;
3025 /* current output position */
3026 int respos = 0;
3027 PyObject *errorHandler = NULL;
3028 PyObject *exc = NULL;
3029 /* the following variable is used for caching string comparisons
3030 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3031 * 3=ignore, 4=xmlcharrefreplace */
3032 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033
3034 /* Default to Latin-1 */
3035 if (mapping == NULL)
3036 return PyUnicode_EncodeLatin1(p, size, errors);
3037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 /* allocate enough for a simple encoding without
3039 replacements, if we need more, we'll resize */
3040 res = PyString_FromStringAndSize(NULL, size);
3041 if (res == NULL)
3042 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003043 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003044 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003046 while (inpos<size) {
3047 /* try to encode it */
3048 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3049 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 if (x==Py_None) { /* unencodable character */
3052 if (charmap_encoding_error(p, size, &inpos, mapping,
3053 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003054 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003055 &res, &respos)) {
3056 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003057 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003060 else
3061 /* done with this character => adjust input position */
3062 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003063 Py_DECREF(x);
3064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 /* Resize if we allocated to much */
3067 if (respos<PyString_GET_SIZE(res)) {
3068 if (_PyString_Resize(&res, respos))
3069 goto onError;
3070 }
3071 Py_XDECREF(exc);
3072 Py_XDECREF(errorHandler);
3073 return res;
3074
3075 onError:
3076 Py_XDECREF(res);
3077 Py_XDECREF(exc);
3078 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 return NULL;
3080}
3081
3082PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3083 PyObject *mapping)
3084{
3085 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3086 PyErr_BadArgument();
3087 return NULL;
3088 }
3089 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3090 PyUnicode_GET_SIZE(unicode),
3091 mapping,
3092 NULL);
3093}
3094
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003095/* create or adjust a UnicodeTranslateError */
3096static void make_translate_exception(PyObject **exceptionObject,
3097 const Py_UNICODE *unicode, int size,
3098 int startpos, int endpos,
3099 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003101 if (*exceptionObject == NULL) {
3102 *exceptionObject = PyUnicodeTranslateError_Create(
3103 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 }
3105 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003106 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3107 goto onError;
3108 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3109 goto onError;
3110 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3111 goto onError;
3112 return;
3113 onError:
3114 Py_DECREF(*exceptionObject);
3115 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 }
3117}
3118
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003119/* raises a UnicodeTranslateError */
3120static void raise_translate_exception(PyObject **exceptionObject,
3121 const Py_UNICODE *unicode, int size,
3122 int startpos, int endpos,
3123 const char *reason)
3124{
3125 make_translate_exception(exceptionObject,
3126 unicode, size, startpos, endpos, reason);
3127 if (*exceptionObject != NULL)
3128 PyCodec_StrictErrors(*exceptionObject);
3129}
3130
3131/* error handling callback helper:
3132 build arguments, call the callback and check the arguments,
3133 put the result into newpos and return the replacement string, which
3134 has to be freed by the caller */
3135static PyObject *unicode_translate_call_errorhandler(const char *errors,
3136 PyObject **errorHandler,
3137 const char *reason,
3138 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3139 int startpos, int endpos,
3140 int *newpos)
3141{
3142 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3143
3144 PyObject *restuple;
3145 PyObject *resunicode;
3146
3147 if (*errorHandler == NULL) {
3148 *errorHandler = PyCodec_LookupError(errors);
3149 if (*errorHandler == NULL)
3150 return NULL;
3151 }
3152
3153 make_translate_exception(exceptionObject,
3154 unicode, size, startpos, endpos, reason);
3155 if (*exceptionObject == NULL)
3156 return NULL;
3157
3158 restuple = PyObject_CallFunctionObjArgs(
3159 *errorHandler, *exceptionObject, NULL);
3160 if (restuple == NULL)
3161 return NULL;
3162 if (!PyTuple_Check(restuple)) {
3163 PyErr_Format(PyExc_TypeError, &argparse[4]);
3164 Py_DECREF(restuple);
3165 return NULL;
3166 }
3167 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3168 &resunicode, newpos)) {
3169 Py_DECREF(restuple);
3170 return NULL;
3171 }
3172 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003173 *newpos = size+*newpos;
3174 if (*newpos<0 || *newpos>size) {
3175 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3176 Py_DECREF(restuple);
3177 return NULL;
3178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003179 Py_INCREF(resunicode);
3180 Py_DECREF(restuple);
3181 return resunicode;
3182}
3183
3184/* Lookup the character ch in the mapping and put the result in result,
3185 which must be decrefed by the caller.
3186 Return 0 on success, -1 on error */
3187static
3188int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3189{
3190 PyObject *w = PyInt_FromLong((long)c);
3191 PyObject *x;
3192
3193 if (w == NULL)
3194 return -1;
3195 x = PyObject_GetItem(mapping, w);
3196 Py_DECREF(w);
3197 if (x == NULL) {
3198 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3199 /* No mapping found means: use 1:1 mapping. */
3200 PyErr_Clear();
3201 *result = NULL;
3202 return 0;
3203 } else
3204 return -1;
3205 }
3206 else if (x == Py_None) {
3207 *result = x;
3208 return 0;
3209 }
3210 else if (PyInt_Check(x)) {
3211 long value = PyInt_AS_LONG(x);
3212 long max = PyUnicode_GetMax();
3213 if (value < 0 || value > max) {
3214 PyErr_Format(PyExc_TypeError,
3215 "character mapping must be in range(0x%lx)", max+1);
3216 Py_DECREF(x);
3217 return -1;
3218 }
3219 *result = x;
3220 return 0;
3221 }
3222 else if (PyUnicode_Check(x)) {
3223 *result = x;
3224 return 0;
3225 }
3226 else {
3227 /* wrong return value */
3228 PyErr_SetString(PyExc_TypeError,
3229 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003230 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003231 return -1;
3232 }
3233}
3234/* ensure that *outobj is at least requiredsize characters long,
3235if not reallocate and adjust various state variables.
3236Return 0 on success, -1 on error */
3237static
Walter Dörwald4894c302003-10-24 14:25:28 +00003238int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003239 int requiredsize)
3240{
Walter Dörwald4894c302003-10-24 14:25:28 +00003241 int oldsize = PyUnicode_GET_SIZE(*outobj);
3242 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003243 /* remember old output position */
3244 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3245 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003246 if (requiredsize < 2 * oldsize)
3247 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003248 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003249 return -1;
3250 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003251 }
3252 return 0;
3253}
3254/* lookup the character, put the result in the output string and adjust
3255 various state variables. Return a new reference to the object that
3256 was put in the output buffer in *result, or Py_None, if the mapping was
3257 undefined (in which case no character was written).
3258 The called must decref result.
3259 Return 0 on success, -1 on error. */
3260static
Walter Dörwald4894c302003-10-24 14:25:28 +00003261int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3262 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3263 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003264{
Walter Dörwald4894c302003-10-24 14:25:28 +00003265 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266 return -1;
3267 if (*res==NULL) {
3268 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003269 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003270 }
3271 else if (*res==Py_None)
3272 ;
3273 else if (PyInt_Check(*res)) {
3274 /* no overflow check, because we know that the space is enough */
3275 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3276 }
3277 else if (PyUnicode_Check(*res)) {
3278 int repsize = PyUnicode_GET_SIZE(*res);
3279 if (repsize==1) {
3280 /* no overflow check, because we know that the space is enough */
3281 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3282 }
3283 else if (repsize!=0) {
3284 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003285 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003286 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003287 repsize - 1;
3288 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003289 return -1;
3290 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3291 *outp += repsize;
3292 }
3293 }
3294 else
3295 return -1;
3296 return 0;
3297}
3298
3299PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 int size,
3301 PyObject *mapping,
3302 const char *errors)
3303{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 /* output object */
3305 PyObject *res = NULL;
3306 /* pointers to the beginning and end+1 of input */
3307 const Py_UNICODE *startp = p;
3308 const Py_UNICODE *endp = p + size;
3309 /* pointer into the output */
3310 Py_UNICODE *str;
3311 /* current output position */
3312 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 char *reason = "character maps to <undefined>";
3314 PyObject *errorHandler = NULL;
3315 PyObject *exc = NULL;
3316 /* the following variable is used for caching string comparisons
3317 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3318 * 3=ignore, 4=xmlcharrefreplace */
3319 int known_errorHandler = -1;
3320
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 if (mapping == NULL) {
3322 PyErr_BadArgument();
3323 return NULL;
3324 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003325
3326 /* allocate enough for a simple 1:1 translation without
3327 replacements, if we need more, we'll resize */
3328 res = PyUnicode_FromUnicode(NULL, size);
3329 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003330 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003332 return res;
3333 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 while (p<endp) {
3336 /* try to encode it */
3337 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003338 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003339 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 goto onError;
3341 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003342 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 if (x!=Py_None) /* it worked => adjust input pointer */
3344 ++p;
3345 else { /* untranslatable character */
3346 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3347 int repsize;
3348 int newpos;
3349 Py_UNICODE *uni2;
3350 /* startpos for collecting untranslatable chars */
3351 const Py_UNICODE *collstart = p;
3352 const Py_UNICODE *collend = p+1;
3353 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355 /* find all untranslatable characters */
3356 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003357 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003358 goto onError;
3359 Py_XDECREF(x);
3360 if (x!=Py_None)
3361 break;
3362 ++collend;
3363 }
3364 /* cache callback name lookup
3365 * (if not done yet, i.e. it's the first error) */
3366 if (known_errorHandler==-1) {
3367 if ((errors==NULL) || (!strcmp(errors, "strict")))
3368 known_errorHandler = 1;
3369 else if (!strcmp(errors, "replace"))
3370 known_errorHandler = 2;
3371 else if (!strcmp(errors, "ignore"))
3372 known_errorHandler = 3;
3373 else if (!strcmp(errors, "xmlcharrefreplace"))
3374 known_errorHandler = 4;
3375 else
3376 known_errorHandler = 0;
3377 }
3378 switch (known_errorHandler) {
3379 case 1: /* strict */
3380 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3381 goto onError;
3382 case 2: /* replace */
3383 /* No need to check for space, this is a 1:1 replacement */
3384 for (coll = collstart; coll<collend; ++coll)
3385 *str++ = '?';
3386 /* fall through */
3387 case 3: /* ignore */
3388 p = collend;
3389 break;
3390 case 4: /* xmlcharrefreplace */
3391 /* generate replacement (temporarily (mis)uses p) */
3392 for (p = collstart; p < collend; ++p) {
3393 char buffer[2+29+1+1];
3394 char *cp;
3395 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003396 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003397 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3398 goto onError;
3399 for (cp = buffer; *cp; ++cp)
3400 *str++ = *cp;
3401 }
3402 p = collend;
3403 break;
3404 default:
3405 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3406 reason, startp, size, &exc,
3407 collstart-startp, collend-startp, &newpos);
3408 if (repunicode == NULL)
3409 goto onError;
3410 /* generate replacement */
3411 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003412 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3414 Py_DECREF(repunicode);
3415 goto onError;
3416 }
3417 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3418 *str++ = *uni2;
3419 p = startp + newpos;
3420 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421 }
3422 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003424 /* Resize if we allocated to much */
3425 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003426 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003427 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003428 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 }
3430 Py_XDECREF(exc);
3431 Py_XDECREF(errorHandler);
3432 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003434 onError:
3435 Py_XDECREF(res);
3436 Py_XDECREF(exc);
3437 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438 return NULL;
3439}
3440
3441PyObject *PyUnicode_Translate(PyObject *str,
3442 PyObject *mapping,
3443 const char *errors)
3444{
3445 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003446
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 str = PyUnicode_FromObject(str);
3448 if (str == NULL)
3449 goto onError;
3450 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3451 PyUnicode_GET_SIZE(str),
3452 mapping,
3453 errors);
3454 Py_DECREF(str);
3455 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003456
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 onError:
3458 Py_XDECREF(str);
3459 return NULL;
3460}
Tim Petersced69f82003-09-16 20:30:58 +00003461
Guido van Rossum9e896b32000-04-05 20:11:21 +00003462/* --- Decimal Encoder ---------------------------------------------------- */
3463
3464int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3465 int length,
3466 char *output,
3467 const char *errors)
3468{
3469 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470 PyObject *errorHandler = NULL;
3471 PyObject *exc = NULL;
3472 const char *encoding = "decimal";
3473 const char *reason = "invalid decimal Unicode string";
3474 /* the following variable is used for caching string comparisons
3475 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3476 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003477
3478 if (output == NULL) {
3479 PyErr_BadArgument();
3480 return -1;
3481 }
3482
3483 p = s;
3484 end = s + length;
3485 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003487 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 PyObject *repunicode;
3489 int repsize;
3490 int newpos;
3491 Py_UNICODE *uni2;
3492 Py_UNICODE *collstart;
3493 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003494
Guido van Rossum9e896b32000-04-05 20:11:21 +00003495 if (Py_UNICODE_ISSPACE(ch)) {
3496 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003498 continue;
3499 }
3500 decimal = Py_UNICODE_TODECIMAL(ch);
3501 if (decimal >= 0) {
3502 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003504 continue;
3505 }
Guido van Rossumba477042000-04-06 18:18:10 +00003506 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003507 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003509 continue;
3510 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511 /* All other characters are considered unencodable */
3512 collstart = p;
3513 collend = p+1;
3514 while (collend < end) {
3515 if ((0 < *collend && *collend < 256) ||
3516 !Py_UNICODE_ISSPACE(*collend) ||
3517 Py_UNICODE_TODECIMAL(*collend))
3518 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003519 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520 /* cache callback name lookup
3521 * (if not done yet, i.e. it's the first error) */
3522 if (known_errorHandler==-1) {
3523 if ((errors==NULL) || (!strcmp(errors, "strict")))
3524 known_errorHandler = 1;
3525 else if (!strcmp(errors, "replace"))
3526 known_errorHandler = 2;
3527 else if (!strcmp(errors, "ignore"))
3528 known_errorHandler = 3;
3529 else if (!strcmp(errors, "xmlcharrefreplace"))
3530 known_errorHandler = 4;
3531 else
3532 known_errorHandler = 0;
3533 }
3534 switch (known_errorHandler) {
3535 case 1: /* strict */
3536 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3537 goto onError;
3538 case 2: /* replace */
3539 for (p = collstart; p < collend; ++p)
3540 *output++ = '?';
3541 /* fall through */
3542 case 3: /* ignore */
3543 p = collend;
3544 break;
3545 case 4: /* xmlcharrefreplace */
3546 /* generate replacement (temporarily (mis)uses p) */
3547 for (p = collstart; p < collend; ++p)
3548 output += sprintf(output, "&#%d;", (int)*p);
3549 p = collend;
3550 break;
3551 default:
3552 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3553 encoding, reason, s, length, &exc,
3554 collstart-s, collend-s, &newpos);
3555 if (repunicode == NULL)
3556 goto onError;
3557 /* generate replacement */
3558 repsize = PyUnicode_GET_SIZE(repunicode);
3559 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3560 Py_UNICODE ch = *uni2;
3561 if (Py_UNICODE_ISSPACE(ch))
3562 *output++ = ' ';
3563 else {
3564 decimal = Py_UNICODE_TODECIMAL(ch);
3565 if (decimal >= 0)
3566 *output++ = '0' + decimal;
3567 else if (0 < ch && ch < 256)
3568 *output++ = (char)ch;
3569 else {
3570 Py_DECREF(repunicode);
3571 raise_encode_exception(&exc, encoding,
3572 s, length, collstart-s, collend-s, reason);
3573 goto onError;
3574 }
3575 }
3576 }
3577 p = s + newpos;
3578 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003579 }
3580 }
3581 /* 0-terminate the output string */
3582 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 Py_XDECREF(exc);
3584 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003585 return 0;
3586
3587 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 Py_XDECREF(exc);
3589 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003590 return -1;
3591}
3592
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593/* --- Helpers ------------------------------------------------------------ */
3594
Tim Petersced69f82003-09-16 20:30:58 +00003595static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596int count(PyUnicodeObject *self,
3597 int start,
3598 int end,
3599 PyUnicodeObject *substring)
3600{
3601 int count = 0;
3602
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003603 if (start < 0)
3604 start += self->length;
3605 if (start < 0)
3606 start = 0;
3607 if (end > self->length)
3608 end = self->length;
3609 if (end < 0)
3610 end += self->length;
3611 if (end < 0)
3612 end = 0;
3613
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003614 if (substring->length == 0)
3615 return (end - start + 1);
3616
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617 end -= substring->length;
3618
3619 while (start <= end)
3620 if (Py_UNICODE_MATCH(self, start, substring)) {
3621 count++;
3622 start += substring->length;
3623 } else
3624 start++;
3625
3626 return count;
3627}
3628
3629int PyUnicode_Count(PyObject *str,
3630 PyObject *substr,
3631 int start,
3632 int end)
3633{
3634 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003635
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636 str = PyUnicode_FromObject(str);
3637 if (str == NULL)
3638 return -1;
3639 substr = PyUnicode_FromObject(substr);
3640 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003641 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 return -1;
3643 }
Tim Petersced69f82003-09-16 20:30:58 +00003644
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 result = count((PyUnicodeObject *)str,
3646 start, end,
3647 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003648
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 Py_DECREF(str);
3650 Py_DECREF(substr);
3651 return result;
3652}
3653
Tim Petersced69f82003-09-16 20:30:58 +00003654static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655int findstring(PyUnicodeObject *self,
3656 PyUnicodeObject *substring,
3657 int start,
3658 int end,
3659 int direction)
3660{
3661 if (start < 0)
3662 start += self->length;
3663 if (start < 0)
3664 start = 0;
3665
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 if (end > self->length)
3667 end = self->length;
3668 if (end < 0)
3669 end += self->length;
3670 if (end < 0)
3671 end = 0;
3672
Guido van Rossum76afbd92002-08-20 17:29:29 +00003673 if (substring->length == 0)
3674 return (direction > 0) ? start : end;
3675
Guido van Rossumd57fd912000-03-10 22:53:23 +00003676 end -= substring->length;
3677
3678 if (direction < 0) {
3679 for (; end >= start; end--)
3680 if (Py_UNICODE_MATCH(self, end, substring))
3681 return end;
3682 } else {
3683 for (; start <= end; start++)
3684 if (Py_UNICODE_MATCH(self, start, substring))
3685 return start;
3686 }
3687
3688 return -1;
3689}
3690
3691int PyUnicode_Find(PyObject *str,
3692 PyObject *substr,
3693 int start,
3694 int end,
3695 int direction)
3696{
3697 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003698
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 str = PyUnicode_FromObject(str);
3700 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003701 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 substr = PyUnicode_FromObject(substr);
3703 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003704 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003705 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 }
Tim Petersced69f82003-09-16 20:30:58 +00003707
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 result = findstring((PyUnicodeObject *)str,
3709 (PyUnicodeObject *)substr,
3710 start, end, direction);
3711 Py_DECREF(str);
3712 Py_DECREF(substr);
3713 return result;
3714}
3715
Tim Petersced69f82003-09-16 20:30:58 +00003716static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717int tailmatch(PyUnicodeObject *self,
3718 PyUnicodeObject *substring,
3719 int start,
3720 int end,
3721 int direction)
3722{
3723 if (start < 0)
3724 start += self->length;
3725 if (start < 0)
3726 start = 0;
3727
3728 if (substring->length == 0)
3729 return 1;
3730
3731 if (end > self->length)
3732 end = self->length;
3733 if (end < 0)
3734 end += self->length;
3735 if (end < 0)
3736 end = 0;
3737
3738 end -= substring->length;
3739 if (end < start)
3740 return 0;
3741
3742 if (direction > 0) {
3743 if (Py_UNICODE_MATCH(self, end, substring))
3744 return 1;
3745 } else {
3746 if (Py_UNICODE_MATCH(self, start, substring))
3747 return 1;
3748 }
3749
3750 return 0;
3751}
3752
3753int PyUnicode_Tailmatch(PyObject *str,
3754 PyObject *substr,
3755 int start,
3756 int end,
3757 int direction)
3758{
3759 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003760
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 str = PyUnicode_FromObject(str);
3762 if (str == NULL)
3763 return -1;
3764 substr = PyUnicode_FromObject(substr);
3765 if (substr == NULL) {
3766 Py_DECREF(substr);
3767 return -1;
3768 }
Tim Petersced69f82003-09-16 20:30:58 +00003769
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 result = tailmatch((PyUnicodeObject *)str,
3771 (PyUnicodeObject *)substr,
3772 start, end, direction);
3773 Py_DECREF(str);
3774 Py_DECREF(substr);
3775 return result;
3776}
3777
Tim Petersced69f82003-09-16 20:30:58 +00003778static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779const Py_UNICODE *findchar(const Py_UNICODE *s,
3780 int size,
3781 Py_UNICODE ch)
3782{
3783 /* like wcschr, but doesn't stop at NULL characters */
3784
3785 while (size-- > 0) {
3786 if (*s == ch)
3787 return s;
3788 s++;
3789 }
3790
3791 return NULL;
3792}
3793
3794/* Apply fixfct filter to the Unicode object self and return a
3795 reference to the modified object */
3796
Tim Petersced69f82003-09-16 20:30:58 +00003797static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798PyObject *fixup(PyUnicodeObject *self,
3799 int (*fixfct)(PyUnicodeObject *s))
3800{
3801
3802 PyUnicodeObject *u;
3803
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003804 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 if (u == NULL)
3806 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003807
3808 Py_UNICODE_COPY(u->str, self->str, self->length);
3809
Tim Peters7a29bd52001-09-12 03:03:31 +00003810 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 /* fixfct should return TRUE if it modified the buffer. If
3812 FALSE, return a reference to the original buffer instead
3813 (to save space, not time) */
3814 Py_INCREF(self);
3815 Py_DECREF(u);
3816 return (PyObject*) self;
3817 }
3818 return (PyObject*) u;
3819}
3820
Tim Petersced69f82003-09-16 20:30:58 +00003821static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822int fixupper(PyUnicodeObject *self)
3823{
3824 int len = self->length;
3825 Py_UNICODE *s = self->str;
3826 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003827
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 while (len-- > 0) {
3829 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003830
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 ch = Py_UNICODE_TOUPPER(*s);
3832 if (ch != *s) {
3833 status = 1;
3834 *s = ch;
3835 }
3836 s++;
3837 }
3838
3839 return status;
3840}
3841
Tim Petersced69f82003-09-16 20:30:58 +00003842static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843int fixlower(PyUnicodeObject *self)
3844{
3845 int len = self->length;
3846 Py_UNICODE *s = self->str;
3847 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003848
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 while (len-- > 0) {
3850 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003851
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 ch = Py_UNICODE_TOLOWER(*s);
3853 if (ch != *s) {
3854 status = 1;
3855 *s = ch;
3856 }
3857 s++;
3858 }
3859
3860 return status;
3861}
3862
Tim Petersced69f82003-09-16 20:30:58 +00003863static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864int fixswapcase(PyUnicodeObject *self)
3865{
3866 int len = self->length;
3867 Py_UNICODE *s = self->str;
3868 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003869
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870 while (len-- > 0) {
3871 if (Py_UNICODE_ISUPPER(*s)) {
3872 *s = Py_UNICODE_TOLOWER(*s);
3873 status = 1;
3874 } else if (Py_UNICODE_ISLOWER(*s)) {
3875 *s = Py_UNICODE_TOUPPER(*s);
3876 status = 1;
3877 }
3878 s++;
3879 }
3880
3881 return status;
3882}
3883
Tim Petersced69f82003-09-16 20:30:58 +00003884static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885int fixcapitalize(PyUnicodeObject *self)
3886{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003887 int len = self->length;
3888 Py_UNICODE *s = self->str;
3889 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003890
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003891 if (len == 0)
3892 return 0;
3893 if (Py_UNICODE_ISLOWER(*s)) {
3894 *s = Py_UNICODE_TOUPPER(*s);
3895 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003897 s++;
3898 while (--len > 0) {
3899 if (Py_UNICODE_ISUPPER(*s)) {
3900 *s = Py_UNICODE_TOLOWER(*s);
3901 status = 1;
3902 }
3903 s++;
3904 }
3905 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906}
3907
3908static
3909int fixtitle(PyUnicodeObject *self)
3910{
3911 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3912 register Py_UNICODE *e;
3913 int previous_is_cased;
3914
3915 /* Shortcut for single character strings */
3916 if (PyUnicode_GET_SIZE(self) == 1) {
3917 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3918 if (*p != ch) {
3919 *p = ch;
3920 return 1;
3921 }
3922 else
3923 return 0;
3924 }
Tim Petersced69f82003-09-16 20:30:58 +00003925
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926 e = p + PyUnicode_GET_SIZE(self);
3927 previous_is_cased = 0;
3928 for (; p < e; p++) {
3929 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003930
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931 if (previous_is_cased)
3932 *p = Py_UNICODE_TOLOWER(ch);
3933 else
3934 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003935
3936 if (Py_UNICODE_ISLOWER(ch) ||
3937 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938 Py_UNICODE_ISTITLE(ch))
3939 previous_is_cased = 1;
3940 else
3941 previous_is_cased = 0;
3942 }
3943 return 1;
3944}
3945
3946PyObject *PyUnicode_Join(PyObject *separator,
3947 PyObject *seq)
3948{
3949 Py_UNICODE *sep;
3950 int seplen;
3951 PyUnicodeObject *res = NULL;
3952 int reslen = 0;
3953 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 int sz = 100;
3955 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003956 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957
Tim Peters2cfe3682001-05-05 05:36:48 +00003958 it = PyObject_GetIter(seq);
3959 if (it == NULL)
3960 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961
3962 if (separator == NULL) {
3963 Py_UNICODE blank = ' ';
3964 sep = &blank;
3965 seplen = 1;
3966 }
3967 else {
3968 separator = PyUnicode_FromObject(separator);
3969 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003970 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 sep = PyUnicode_AS_UNICODE(separator);
3972 seplen = PyUnicode_GET_SIZE(separator);
3973 }
Tim Petersced69f82003-09-16 20:30:58 +00003974
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 res = _PyUnicode_New(sz);
3976 if (res == NULL)
3977 goto onError;
3978 p = PyUnicode_AS_UNICODE(res);
3979 reslen = 0;
3980
Tim Peters2cfe3682001-05-05 05:36:48 +00003981 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003983 PyObject *item = PyIter_Next(it);
3984 if (item == NULL) {
3985 if (PyErr_Occurred())
3986 goto onError;
3987 break;
3988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 if (!PyUnicode_Check(item)) {
3990 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003991 if (!PyString_Check(item)) {
3992 PyErr_Format(PyExc_TypeError,
3993 "sequence item %i: expected string or Unicode,"
3994 " %.80s found",
3995 i, item->ob_type->tp_name);
3996 Py_DECREF(item);
3997 goto onError;
3998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 v = PyUnicode_FromObject(item);
4000 Py_DECREF(item);
4001 item = v;
4002 if (item == NULL)
4003 goto onError;
4004 }
4005 itemlen = PyUnicode_GET_SIZE(item);
4006 while (reslen + itemlen + seplen >= sz) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004007 if (_PyUnicode_Resize(&res, sz*2) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004008 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 sz *= 2;
4012 p = PyUnicode_AS_UNICODE(res) + reslen;
4013 }
4014 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004015 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 p += seplen;
4017 reslen += seplen;
4018 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004019 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020 p += itemlen;
4021 reslen += itemlen;
4022 Py_DECREF(item);
4023 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004024 if (_PyUnicode_Resize(&res, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025 goto onError;
4026
4027 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004028 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029 return (PyObject *)res;
4030
4031 onError:
4032 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004033 Py_XDECREF(res);
4034 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035 return NULL;
4036}
4037
Tim Petersced69f82003-09-16 20:30:58 +00004038static
4039PyUnicodeObject *pad(PyUnicodeObject *self,
4040 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 int right,
4042 Py_UNICODE fill)
4043{
4044 PyUnicodeObject *u;
4045
4046 if (left < 0)
4047 left = 0;
4048 if (right < 0)
4049 right = 0;
4050
Tim Peters7a29bd52001-09-12 03:03:31 +00004051 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 Py_INCREF(self);
4053 return self;
4054 }
4055
4056 u = _PyUnicode_New(left + self->length + right);
4057 if (u) {
4058 if (left)
4059 Py_UNICODE_FILL(u->str, fill, left);
4060 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4061 if (right)
4062 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4063 }
4064
4065 return u;
4066}
4067
/* Append the slice data[left:right] as a new unicode object to the list.

   The macro relies on names from the calling function: a PyObject *str
   scratch variable, the PyObject *list being filled and an onError label
   that is jumped to when the slice cannot be created or appended.

   Wrapped in do { } while (0) so that "SPLIT_APPEND(...);" is a single
   statement and stays correct inside an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
4078
/* Insert the slice data[left:right] as a new unicode object at the front
   of the list.

   The macro relies on names from the calling function: a PyObject *str
   scratch variable, the PyObject *list being filled and an onError label
   that is jumped to when the slice cannot be created or inserted.

   Wrapped in do { } while (0) so that "SPLIT_INSERT(...);" is a single
   statement and stays correct inside an unbraced if/else. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
4089
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090static
4091PyObject *split_whitespace(PyUnicodeObject *self,
4092 PyObject *list,
4093 int maxcount)
4094{
4095 register int i;
4096 register int j;
4097 int len = self->length;
4098 PyObject *str;
4099
4100 for (i = j = 0; i < len; ) {
4101 /* find a token */
4102 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4103 i++;
4104 j = i;
4105 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4106 i++;
4107 if (j < i) {
4108 if (maxcount-- <= 0)
4109 break;
4110 SPLIT_APPEND(self->str, j, i);
4111 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4112 i++;
4113 j = i;
4114 }
4115 }
4116 if (j < len) {
4117 SPLIT_APPEND(self->str, j, len);
4118 }
4119 return list;
4120
4121 onError:
4122 Py_DECREF(list);
4123 return NULL;
4124}
4125
4126PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004127 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128{
4129 register int i;
4130 register int j;
4131 int len;
4132 PyObject *list;
4133 PyObject *str;
4134 Py_UNICODE *data;
4135
4136 string = PyUnicode_FromObject(string);
4137 if (string == NULL)
4138 return NULL;
4139 data = PyUnicode_AS_UNICODE(string);
4140 len = PyUnicode_GET_SIZE(string);
4141
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 list = PyList_New(0);
4143 if (!list)
4144 goto onError;
4145
4146 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004147 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004148
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149 /* Find a line and append it */
4150 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4151 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152
4153 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004154 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155 if (i < len) {
4156 if (data[i] == '\r' && i + 1 < len &&
4157 data[i+1] == '\n')
4158 i += 2;
4159 else
4160 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004161 if (keepends)
4162 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163 }
Guido van Rossum86662912000-04-11 15:38:46 +00004164 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 j = i;
4166 }
4167 if (j < len) {
4168 SPLIT_APPEND(data, j, len);
4169 }
4170
4171 Py_DECREF(string);
4172 return list;
4173
4174 onError:
4175 Py_DECREF(list);
4176 Py_DECREF(string);
4177 return NULL;
4178}
4179
Tim Petersced69f82003-09-16 20:30:58 +00004180static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181PyObject *split_char(PyUnicodeObject *self,
4182 PyObject *list,
4183 Py_UNICODE ch,
4184 int maxcount)
4185{
4186 register int i;
4187 register int j;
4188 int len = self->length;
4189 PyObject *str;
4190
4191 for (i = j = 0; i < len; ) {
4192 if (self->str[i] == ch) {
4193 if (maxcount-- <= 0)
4194 break;
4195 SPLIT_APPEND(self->str, j, i);
4196 i = j = i + 1;
4197 } else
4198 i++;
4199 }
4200 if (j <= len) {
4201 SPLIT_APPEND(self->str, j, len);
4202 }
4203 return list;
4204
4205 onError:
4206 Py_DECREF(list);
4207 return NULL;
4208}
4209
Tim Petersced69f82003-09-16 20:30:58 +00004210static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211PyObject *split_substring(PyUnicodeObject *self,
4212 PyObject *list,
4213 PyUnicodeObject *substring,
4214 int maxcount)
4215{
4216 register int i;
4217 register int j;
4218 int len = self->length;
4219 int sublen = substring->length;
4220 PyObject *str;
4221
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004222 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004223 if (Py_UNICODE_MATCH(self, i, substring)) {
4224 if (maxcount-- <= 0)
4225 break;
4226 SPLIT_APPEND(self->str, j, i);
4227 i = j = i + sublen;
4228 } else
4229 i++;
4230 }
4231 if (j <= len) {
4232 SPLIT_APPEND(self->str, j, len);
4233 }
4234 return list;
4235
4236 onError:
4237 Py_DECREF(list);
4238 return NULL;
4239}
4240
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004241static
4242PyObject *rsplit_whitespace(PyUnicodeObject *self,
4243 PyObject *list,
4244 int maxcount)
4245{
4246 register int i;
4247 register int j;
4248 int len = self->length;
4249 PyObject *str;
4250
4251 for (i = j = len - 1; i >= 0; ) {
4252 /* find a token */
4253 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4254 i--;
4255 j = i;
4256 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4257 i--;
4258 if (j > i) {
4259 if (maxcount-- <= 0)
4260 break;
4261 SPLIT_INSERT(self->str, i + 1, j + 1);
4262 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4263 i--;
4264 j = i;
4265 }
4266 }
4267 if (j >= 0) {
4268 SPLIT_INSERT(self->str, 0, j + 1);
4269 }
4270 return list;
4271
4272 onError:
4273 Py_DECREF(list);
4274 return NULL;
4275}
4276
4277static
4278PyObject *rsplit_char(PyUnicodeObject *self,
4279 PyObject *list,
4280 Py_UNICODE ch,
4281 int maxcount)
4282{
4283 register int i;
4284 register int j;
4285 int len = self->length;
4286 PyObject *str;
4287
4288 for (i = j = len - 1; i >= 0; ) {
4289 if (self->str[i] == ch) {
4290 if (maxcount-- <= 0)
4291 break;
4292 SPLIT_INSERT(self->str, i + 1, j + 1);
4293 j = i = i - 1;
4294 } else
4295 i--;
4296 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004297 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004298 SPLIT_INSERT(self->str, 0, j + 1);
4299 }
4300 return list;
4301
4302 onError:
4303 Py_DECREF(list);
4304 return NULL;
4305}
4306
4307static
4308PyObject *rsplit_substring(PyUnicodeObject *self,
4309 PyObject *list,
4310 PyUnicodeObject *substring,
4311 int maxcount)
4312{
4313 register int i;
4314 register int j;
4315 int len = self->length;
4316 int sublen = substring->length;
4317 PyObject *str;
4318
4319 for (i = len - sublen, j = len; i >= 0; ) {
4320 if (Py_UNICODE_MATCH(self, i, substring)) {
4321 if (maxcount-- <= 0)
4322 break;
4323 SPLIT_INSERT(self->str, i + sublen, j);
4324 j = i;
4325 i -= sublen;
4326 } else
4327 i--;
4328 }
4329 if (j >= 0) {
4330 SPLIT_INSERT(self->str, 0, j);
4331 }
4332 return list;
4333
4334 onError:
4335 Py_DECREF(list);
4336 return NULL;
4337}
4338
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004340#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341
4342static
4343PyObject *split(PyUnicodeObject *self,
4344 PyUnicodeObject *substring,
4345 int maxcount)
4346{
4347 PyObject *list;
4348
4349 if (maxcount < 0)
4350 maxcount = INT_MAX;
4351
4352 list = PyList_New(0);
4353 if (!list)
4354 return NULL;
4355
4356 if (substring == NULL)
4357 return split_whitespace(self,list,maxcount);
4358
4359 else if (substring->length == 1)
4360 return split_char(self,list,substring->str[0],maxcount);
4361
4362 else if (substring->length == 0) {
4363 Py_DECREF(list);
4364 PyErr_SetString(PyExc_ValueError, "empty separator");
4365 return NULL;
4366 }
4367 else
4368 return split_substring(self,list,substring,maxcount);
4369}
4370
Tim Petersced69f82003-09-16 20:30:58 +00004371static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004372PyObject *rsplit(PyUnicodeObject *self,
4373 PyUnicodeObject *substring,
4374 int maxcount)
4375{
4376 PyObject *list;
4377
4378 if (maxcount < 0)
4379 maxcount = INT_MAX;
4380
4381 list = PyList_New(0);
4382 if (!list)
4383 return NULL;
4384
4385 if (substring == NULL)
4386 return rsplit_whitespace(self,list,maxcount);
4387
4388 else if (substring->length == 1)
4389 return rsplit_char(self,list,substring->str[0],maxcount);
4390
4391 else if (substring->length == 0) {
4392 Py_DECREF(list);
4393 PyErr_SetString(PyExc_ValueError, "empty separator");
4394 return NULL;
4395 }
4396 else
4397 return rsplit_substring(self,list,substring,maxcount);
4398}
4399
4400static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401PyObject *replace(PyUnicodeObject *self,
4402 PyUnicodeObject *str1,
4403 PyUnicodeObject *str2,
4404 int maxcount)
4405{
4406 PyUnicodeObject *u;
4407
4408 if (maxcount < 0)
4409 maxcount = INT_MAX;
4410
4411 if (str1->length == 1 && str2->length == 1) {
4412 int i;
4413
4414 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004415 if (!findchar(self->str, self->length, str1->str[0]) &&
4416 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417 /* nothing to replace, return original string */
4418 Py_INCREF(self);
4419 u = self;
4420 } else {
4421 Py_UNICODE u1 = str1->str[0];
4422 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004423
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004425 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 self->length
4427 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004428 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004429 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004430 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 for (i = 0; i < u->length; i++)
4432 if (u->str[i] == u1) {
4433 if (--maxcount < 0)
4434 break;
4435 u->str[i] = u2;
4436 }
4437 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439
4440 } else {
4441 int n, i;
4442 Py_UNICODE *p;
4443
4444 /* replace strings */
4445 n = count(self, 0, self->length, str1);
4446 if (n > maxcount)
4447 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004448 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004450 if (PyUnicode_CheckExact(self)) {
4451 Py_INCREF(self);
4452 u = self;
4453 }
4454 else {
4455 u = (PyUnicodeObject *)
4456 PyUnicode_FromUnicode(self->str, self->length);
4457 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 } else {
4459 u = _PyUnicode_New(
4460 self->length + n * (str2->length - str1->length));
4461 if (u) {
4462 i = 0;
4463 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004464 if (str1->length > 0) {
4465 while (i <= self->length - str1->length)
4466 if (Py_UNICODE_MATCH(self, i, str1)) {
4467 /* replace string segment */
4468 Py_UNICODE_COPY(p, str2->str, str2->length);
4469 p += str2->length;
4470 i += str1->length;
4471 if (--n <= 0) {
4472 /* copy remaining part */
4473 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4474 break;
4475 }
4476 } else
4477 *p++ = self->str[i++];
4478 } else {
4479 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 Py_UNICODE_COPY(p, str2->str, str2->length);
4481 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004482 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004485 }
4486 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4487 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 }
4489 }
4490 }
Tim Petersced69f82003-09-16 20:30:58 +00004491
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 return (PyObject *) u;
4493}
4494
4495/* --- Unicode Object Methods --------------------------------------------- */
4496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004497PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498"S.title() -> unicode\n\
4499\n\
4500Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004501characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502
4503static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004504unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 return fixup(self, fixtitle);
4507}
4508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004509PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510"S.capitalize() -> unicode\n\
4511\n\
4512Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004513have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514
4515static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004516unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 return fixup(self, fixcapitalize);
4519}
4520
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method implementation of S.capwords(): splits self on
   whitespace, runs every word through fixup() with fixcapitalize and
   joins the words again with PyUnicode_Join(NULL, ...).  Returns NULL if
   any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old item, so do it here */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4558
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004559/* Argument converter. Coerces to a single unicode character */
4560
4561static int
4562convert_uc(PyObject *obj, void *addr)
4563{
4564 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4565 PyObject *uniobj;
4566 Py_UNICODE *unistr;
4567
4568 uniobj = PyUnicode_FromObject(obj);
4569 if (uniobj == NULL) {
4570 PyErr_SetString(PyExc_TypeError,
4571 "The fill character cannot be converted to Unicode");
4572 return 0;
4573 }
4574 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4575 PyErr_SetString(PyExc_TypeError,
4576 "The fill character must be exactly one character long");
4577 Py_DECREF(uniobj);
4578 return 0;
4579 }
4580 unistr = PyUnicode_AS_UNICODE(uniobj);
4581 *fillcharloc = unistr[0];
4582 Py_DECREF(uniobj);
4583 return 1;
4584}
4585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004586PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004587"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004589Return S centered in a Unicode string of length width. Padding is\n\
4590done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591
4592static PyObject *
4593unicode_center(PyUnicodeObject *self, PyObject *args)
4594{
4595 int marg, left;
4596 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004597 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004599 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600 return NULL;
4601
Tim Peters7a29bd52001-09-12 03:03:31 +00004602 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004603 Py_INCREF(self);
4604 return (PyObject*) self;
4605 }
4606
4607 marg = width - self->length;
4608 left = marg / 2 + (marg & width & 1);
4609
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004610 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611}
4612
Marc-André Lemburge5034372000-08-08 08:04:29 +00004613#if 0
4614
4615/* This code should go into some future Unicode collation support
4616 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004617 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004618
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004619/* speedy UTF-16 code point order comparison */
4620/* gleaned from: */
4621/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4622
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004623static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004624{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004625 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004626 0, 0, 0, 0, 0, 0, 0, 0,
4627 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004628 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004629};
4630
Guido van Rossumd57fd912000-03-10 22:53:23 +00004631static int
4632unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4633{
4634 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004635
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636 Py_UNICODE *s1 = str1->str;
4637 Py_UNICODE *s2 = str2->str;
4638
4639 len1 = str1->length;
4640 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004641
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004643 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004644
4645 c1 = *s1++;
4646 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004647
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004648 if (c1 > (1<<11) * 26)
4649 c1 += utf16Fixup[c1>>11];
4650 if (c2 > (1<<11) * 26)
4651 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004652 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004653
4654 if (c1 != c2)
4655 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004656
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004657 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658 }
4659
4660 return (len1 < len2) ? -1 : (len1 != len2);
4661}
4662
Marc-André Lemburge5034372000-08-08 08:04:29 +00004663#else
4664
4665static int
4666unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4667{
4668 register int len1, len2;
4669
4670 Py_UNICODE *s1 = str1->str;
4671 Py_UNICODE *s2 = str2->str;
4672
4673 len1 = str1->length;
4674 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004675
Marc-André Lemburge5034372000-08-08 08:04:29 +00004676 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004677 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004678
Fredrik Lundh45714e92001-06-26 16:39:36 +00004679 c1 = *s1++;
4680 c2 = *s2++;
4681
4682 if (c1 != c2)
4683 return (c1 < c2) ? -1 : 1;
4684
Marc-André Lemburge5034372000-08-08 08:04:29 +00004685 len1--; len2--;
4686 }
4687
4688 return (len1 < len2) ? -1 : (len1 != len2);
4689}
4690
4691#endif
4692
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693int PyUnicode_Compare(PyObject *left,
4694 PyObject *right)
4695{
4696 PyUnicodeObject *u = NULL, *v = NULL;
4697 int result;
4698
4699 /* Coerce the two arguments */
4700 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4701 if (u == NULL)
4702 goto onError;
4703 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4704 if (v == NULL)
4705 goto onError;
4706
Thomas Wouters7e474022000-07-16 12:04:32 +00004707 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708 if (v == u) {
4709 Py_DECREF(u);
4710 Py_DECREF(v);
4711 return 0;
4712 }
4713
4714 result = unicode_compare(u, v);
4715
4716 Py_DECREF(u);
4717 Py_DECREF(v);
4718 return result;
4719
4720onError:
4721 Py_XDECREF(u);
4722 Py_XDECREF(v);
4723 return -1;
4724}
4725
Guido van Rossum403d68b2000-03-13 15:55:09 +00004726int PyUnicode_Contains(PyObject *container,
4727 PyObject *element)
4728{
4729 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004730 int result, size;
4731 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004732
4733 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004734 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004735 if (v == NULL) {
4736 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004737 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004738 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004739 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004740 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004741 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004742 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004743
Barry Warsaw817918c2002-08-06 16:58:21 +00004744 size = PyUnicode_GET_SIZE(v);
4745 rhs = PyUnicode_AS_UNICODE(v);
4746 lhs = PyUnicode_AS_UNICODE(u);
4747
Guido van Rossum403d68b2000-03-13 15:55:09 +00004748 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004749 if (size == 1) {
4750 end = lhs + PyUnicode_GET_SIZE(u);
4751 while (lhs < end) {
4752 if (*lhs++ == *rhs) {
4753 result = 1;
4754 break;
4755 }
4756 }
4757 }
4758 else {
4759 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4760 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004761 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004762 result = 1;
4763 break;
4764 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004765 }
4766 }
4767
4768 Py_DECREF(u);
4769 Py_DECREF(v);
4770 return result;
4771
4772onError:
4773 Py_XDECREF(u);
4774 Py_XDECREF(v);
4775 return -1;
4776}
4777
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778/* Concat to string or Unicode object giving a new Unicode object. */
4779
4780PyObject *PyUnicode_Concat(PyObject *left,
4781 PyObject *right)
4782{
4783 PyUnicodeObject *u = NULL, *v = NULL, *w;
4784
4785 /* Coerce the two arguments */
4786 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4787 if (u == NULL)
4788 goto onError;
4789 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4790 if (v == NULL)
4791 goto onError;
4792
4793 /* Shortcuts */
4794 if (v == unicode_empty) {
4795 Py_DECREF(v);
4796 return (PyObject *)u;
4797 }
4798 if (u == unicode_empty) {
4799 Py_DECREF(u);
4800 return (PyObject *)v;
4801 }
4802
4803 /* Concat the two Unicode strings */
4804 w = _PyUnicode_New(u->length + v->length);
4805 if (w == NULL)
4806 goto onError;
4807 Py_UNICODE_COPY(w->str, u->str, u->length);
4808 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4809
4810 Py_DECREF(u);
4811 Py_DECREF(v);
4812 return (PyObject *)w;
4813
4814onError:
4815 Py_XDECREF(u);
4816 Py_XDECREF(v);
4817 return NULL;
4818}
4819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004820PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821"S.count(sub[, start[, end]]) -> int\n\
4822\n\
4823Return the number of occurrences of substring sub in Unicode string\n\
4824S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004825interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826
4827static PyObject *
4828unicode_count(PyUnicodeObject *self, PyObject *args)
4829{
4830 PyUnicodeObject *substring;
4831 int start = 0;
4832 int end = INT_MAX;
4833 PyObject *result;
4834
Guido van Rossumb8872e62000-05-09 14:14:27 +00004835 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4836 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837 return NULL;
4838
4839 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4840 (PyObject *)substring);
4841 if (substring == NULL)
4842 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004843
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 if (start < 0)
4845 start += self->length;
4846 if (start < 0)
4847 start = 0;
4848 if (end > self->length)
4849 end = self->length;
4850 if (end < 0)
4851 end += self->length;
4852 if (end < 0)
4853 end = 0;
4854
4855 result = PyInt_FromLong((long) count(self, start, end, substring));
4856
4857 Py_DECREF(substring);
4858 return result;
4859}
4860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004861PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862"S.encode([encoding[,errors]]) -> string\n\
4863\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004864Return an encoded string version of S. Default encoding is the current\n\
4865default string encoding. errors may be given to set a different error\n\
4866handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4868'xmlcharrefreplace' as well as any other name registered with\n\
4869codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870
4871static PyObject *
4872unicode_encode(PyUnicodeObject *self, PyObject *args)
4873{
4874 char *encoding = NULL;
4875 char *errors = NULL;
4876 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4877 return NULL;
4878 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4879}
4880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004881PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882"S.expandtabs([tabsize]) -> unicode\n\
4883\n\
4884Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004885If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886
4887static PyObject*
4888unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4889{
4890 Py_UNICODE *e;
4891 Py_UNICODE *p;
4892 Py_UNICODE *q;
4893 int i, j;
4894 PyUnicodeObject *u;
4895 int tabsize = 8;
4896
4897 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4898 return NULL;
4899
Thomas Wouters7e474022000-07-16 12:04:32 +00004900 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 i = j = 0;
4902 e = self->str + self->length;
4903 for (p = self->str; p < e; p++)
4904 if (*p == '\t') {
4905 if (tabsize > 0)
4906 j += tabsize - (j % tabsize);
4907 }
4908 else {
4909 j++;
4910 if (*p == '\n' || *p == '\r') {
4911 i += j;
4912 j = 0;
4913 }
4914 }
4915
4916 /* Second pass: create output string and fill it */
4917 u = _PyUnicode_New(i + j);
4918 if (!u)
4919 return NULL;
4920
4921 j = 0;
4922 q = u->str;
4923
4924 for (p = self->str; p < e; p++)
4925 if (*p == '\t') {
4926 if (tabsize > 0) {
4927 i = tabsize - (j % tabsize);
4928 j += i;
4929 while (i--)
4930 *q++ = ' ';
4931 }
4932 }
4933 else {
4934 j++;
4935 *q++ = *p;
4936 if (*p == '\n' || *p == '\r')
4937 j = 0;
4938 }
4939
4940 return (PyObject*) u;
4941}
4942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004943PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944"S.find(sub [,start [,end]]) -> int\n\
4945\n\
4946Return the lowest index in S where substring sub is found,\n\
4947such that sub is contained within s[start,end]. Optional\n\
4948arguments start and end are interpreted as in slice notation.\n\
4949\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004950Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951
4952static PyObject *
4953unicode_find(PyUnicodeObject *self, PyObject *args)
4954{
4955 PyUnicodeObject *substring;
4956 int start = 0;
4957 int end = INT_MAX;
4958 PyObject *result;
4959
Guido van Rossumb8872e62000-05-09 14:14:27 +00004960 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4961 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962 return NULL;
4963 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4964 (PyObject *)substring);
4965 if (substring == NULL)
4966 return NULL;
4967
4968 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4969
4970 Py_DECREF(substring);
4971 return result;
4972}
4973
4974static PyObject *
4975unicode_getitem(PyUnicodeObject *self, int index)
4976{
4977 if (index < 0 || index >= self->length) {
4978 PyErr_SetString(PyExc_IndexError, "string index out of range");
4979 return NULL;
4980 }
4981
4982 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4983}
4984
4985static long
4986unicode_hash(PyUnicodeObject *self)
4987{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004988 /* Since Unicode objects compare equal to their ASCII string
4989 counterparts, they should use the individual character values
4990 as basis for their hash value. This is needed to assure that
4991 strings and Unicode objects behave in the same way as
4992 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993
Fredrik Lundhdde61642000-07-10 18:27:47 +00004994 register int len;
4995 register Py_UNICODE *p;
4996 register long x;
4997
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 if (self->hash != -1)
4999 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005000 len = PyUnicode_GET_SIZE(self);
5001 p = PyUnicode_AS_UNICODE(self);
5002 x = *p << 7;
5003 while (--len >= 0)
5004 x = (1000003*x) ^ *p++;
5005 x ^= PyUnicode_GET_SIZE(self);
5006 if (x == -1)
5007 x = -2;
5008 self->hash = x;
5009 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010}
5011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005012PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013"S.index(sub [,start [,end]]) -> int\n\
5014\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005015Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016
5017static PyObject *
5018unicode_index(PyUnicodeObject *self, PyObject *args)
5019{
5020 int result;
5021 PyUnicodeObject *substring;
5022 int start = 0;
5023 int end = INT_MAX;
5024
Guido van Rossumb8872e62000-05-09 14:14:27 +00005025 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5026 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005028
Guido van Rossumd57fd912000-03-10 22:53:23 +00005029 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5030 (PyObject *)substring);
5031 if (substring == NULL)
5032 return NULL;
5033
5034 result = findstring(self, substring, start, end, 1);
5035
5036 Py_DECREF(substring);
5037 if (result < 0) {
5038 PyErr_SetString(PyExc_ValueError, "substring not found");
5039 return NULL;
5040 }
5041 return PyInt_FromLong(result);
5042}
5043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005044PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005045"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005047Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005048at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049
5050static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005051unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052{
5053 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5054 register const Py_UNICODE *e;
5055 int cased;
5056
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057 /* Shortcut for single character strings */
5058 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005059 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005061 /* Special case for empty strings */
5062 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005063 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005064
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065 e = p + PyUnicode_GET_SIZE(self);
5066 cased = 0;
5067 for (; p < e; p++) {
5068 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005069
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005071 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 else if (!cased && Py_UNICODE_ISLOWER(ch))
5073 cased = 1;
5074 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005075 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076}
5077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005078PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005079"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005081Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005082at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083
5084static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005085unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086{
5087 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5088 register const Py_UNICODE *e;
5089 int cased;
5090
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 /* Shortcut for single character strings */
5092 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005093 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005095 /* Special case for empty strings */
5096 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005097 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005098
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099 e = p + PyUnicode_GET_SIZE(self);
5100 cased = 0;
5101 for (; p < e; p++) {
5102 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005103
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005105 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 else if (!cased && Py_UNICODE_ISUPPER(ch))
5107 cased = 1;
5108 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005109 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110}
5111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005112PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005113"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005115Return True if S is a titlecased string and there is at least one\n\
5116character in S, i.e. upper- and titlecase characters may only\n\
5117follow uncased characters and lowercase characters only cased ones.\n\
5118Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119
5120static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005121unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122{
5123 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5124 register const Py_UNICODE *e;
5125 int cased, previous_is_cased;
5126
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 /* Shortcut for single character strings */
5128 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005129 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5130 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005132 /* Special case for empty strings */
5133 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005134 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005135
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 e = p + PyUnicode_GET_SIZE(self);
5137 cased = 0;
5138 previous_is_cased = 0;
5139 for (; p < e; p++) {
5140 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005141
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5143 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005144 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 previous_is_cased = 1;
5146 cased = 1;
5147 }
5148 else if (Py_UNICODE_ISLOWER(ch)) {
5149 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005150 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 previous_is_cased = 1;
5152 cased = 1;
5153 }
5154 else
5155 previous_is_cased = 0;
5156 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005157 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158}
5159
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005160PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005161"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005163Return True if all characters in S are whitespace\n\
5164and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165
5166static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005167unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168{
5169 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5170 register const Py_UNICODE *e;
5171
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172 /* Shortcut for single character strings */
5173 if (PyUnicode_GET_SIZE(self) == 1 &&
5174 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005175 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005177 /* Special case for empty strings */
5178 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005179 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005180
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 e = p + PyUnicode_GET_SIZE(self);
5182 for (; p < e; p++) {
5183 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005184 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005186 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187}
5188
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005189PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005190"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005191\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005192Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005193and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005194
5195static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005196unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005197{
5198 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5199 register const Py_UNICODE *e;
5200
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005201 /* Shortcut for single character strings */
5202 if (PyUnicode_GET_SIZE(self) == 1 &&
5203 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005204 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005205
5206 /* Special case for empty strings */
5207 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005208 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005209
5210 e = p + PyUnicode_GET_SIZE(self);
5211 for (; p < e; p++) {
5212 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005213 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005214 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005215 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005216}
5217
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005218PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005219"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005220\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005221Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005222and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005223
5224static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005225unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005226{
5227 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5228 register const Py_UNICODE *e;
5229
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005230 /* Shortcut for single character strings */
5231 if (PyUnicode_GET_SIZE(self) == 1 &&
5232 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005233 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005234
5235 /* Special case for empty strings */
5236 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005237 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005238
5239 e = p + PyUnicode_GET_SIZE(self);
5240 for (; p < e; p++) {
5241 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005242 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005243 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005244 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005245}
5246
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005247PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005248"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005250Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005251False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252
5253static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005254unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255{
5256 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5257 register const Py_UNICODE *e;
5258
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 /* Shortcut for single character strings */
5260 if (PyUnicode_GET_SIZE(self) == 1 &&
5261 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005262 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005264 /* Special case for empty strings */
5265 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005266 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005267
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 e = p + PyUnicode_GET_SIZE(self);
5269 for (; p < e; p++) {
5270 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005271 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005273 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274}
5275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005276PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005277"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005279Return True if all characters in S are digits\n\
5280and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281
5282static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005283unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284{
5285 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5286 register const Py_UNICODE *e;
5287
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 /* Shortcut for single character strings */
5289 if (PyUnicode_GET_SIZE(self) == 1 &&
5290 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005291 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005293 /* Special case for empty strings */
5294 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005295 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005296
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 e = p + PyUnicode_GET_SIZE(self);
5298 for (; p < e; p++) {
5299 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005300 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005302 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303}
5304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005305PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005306"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005308Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005309False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310
5311static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005312unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313{
5314 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5315 register const Py_UNICODE *e;
5316
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317 /* Shortcut for single character strings */
5318 if (PyUnicode_GET_SIZE(self) == 1 &&
5319 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005320 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005322 /* Special case for empty strings */
5323 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005324 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005325
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 e = p + PyUnicode_GET_SIZE(self);
5327 for (; p < e; p++) {
5328 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005329 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005331 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332}
5333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005334PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335"S.join(sequence) -> unicode\n\
5336\n\
5337Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005338sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339
5340static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005341unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005343 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344}
5345
5346static int
5347unicode_length(PyUnicodeObject *self)
5348{
5349 return self->length;
5350}
5351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005352PyDoc_STRVAR(ljust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005353"S.ljust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354\n\
5355Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005356done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357
5358static PyObject *
5359unicode_ljust(PyUnicodeObject *self, PyObject *args)
5360{
5361 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005362 Py_UNICODE fillchar = ' ';
5363
5364 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 return NULL;
5366
Tim Peters7a29bd52001-09-12 03:03:31 +00005367 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 Py_INCREF(self);
5369 return (PyObject*) self;
5370 }
5371
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005372 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373}
5374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005375PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376"S.lower() -> unicode\n\
5377\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005378Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379
5380static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005381unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 return fixup(self, fixlower);
5384}
5385
/* Strip modes; the values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
5394
5395static const Py_UNICODE *
5396unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5397{
Tim Peters030a5ce2002-04-22 19:00:10 +00005398 size_t i;
5399 for (i = 0; i < n; ++i)
5400 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005401 return s+i;
5402 return NULL;
5403}
5404
5405/* externally visible for str.strip(unicode) */
5406PyObject *
5407_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5408{
5409 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5410 int len = PyUnicode_GET_SIZE(self);
5411 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5412 int seplen = PyUnicode_GET_SIZE(sepobj);
5413 int i, j;
5414
5415 i = 0;
5416 if (striptype != RIGHTSTRIP) {
5417 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5418 i++;
5419 }
5420 }
5421
5422 j = len;
5423 if (striptype != LEFTSTRIP) {
5424 do {
5425 j--;
5426 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5427 j++;
5428 }
5429
5430 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5431 Py_INCREF(self);
5432 return (PyObject*)self;
5433 }
5434 else
5435 return PyUnicode_FromUnicode(s+i, j-i);
5436}
5437
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438
5439static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005440do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005442 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5443 int len = PyUnicode_GET_SIZE(self), i, j;
5444
5445 i = 0;
5446 if (striptype != RIGHTSTRIP) {
5447 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5448 i++;
5449 }
5450 }
5451
5452 j = len;
5453 if (striptype != LEFTSTRIP) {
5454 do {
5455 j--;
5456 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5457 j++;
5458 }
5459
5460 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5461 Py_INCREF(self);
5462 return (PyObject*)self;
5463 }
5464 else
5465 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466}
5467
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005468
5469static PyObject *
5470do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5471{
5472 PyObject *sep = NULL;
5473
5474 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5475 return NULL;
5476
5477 if (sep != NULL && sep != Py_None) {
5478 if (PyUnicode_Check(sep))
5479 return _PyUnicode_XStrip(self, striptype, sep);
5480 else if (PyString_Check(sep)) {
5481 PyObject *res;
5482 sep = PyUnicode_FromObject(sep);
5483 if (sep==NULL)
5484 return NULL;
5485 res = _PyUnicode_XStrip(self, striptype, sep);
5486 Py_DECREF(sep);
5487 return res;
5488 }
5489 else {
5490 PyErr_Format(PyExc_TypeError,
5491 "%s arg must be None, unicode or str",
5492 STRIPNAME(striptype));
5493 return NULL;
5494 }
5495 }
5496
5497 return do_strip(self, striptype);
5498}
5499
5500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005501PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005502"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005503\n\
5504Return a copy of the string S with leading and trailing\n\
5505whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005506If chars is given and not None, remove characters in chars instead.\n\
5507If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005508
5509static PyObject *
5510unicode_strip(PyUnicodeObject *self, PyObject *args)
5511{
5512 if (PyTuple_GET_SIZE(args) == 0)
5513 return do_strip(self, BOTHSTRIP); /* Common case */
5514 else
5515 return do_argstrip(self, BOTHSTRIP, args);
5516}
5517
5518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005519PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005520"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005521\n\
5522Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005523If chars is given and not None, remove characters in chars instead.\n\
5524If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005525
5526static PyObject *
5527unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5528{
5529 if (PyTuple_GET_SIZE(args) == 0)
5530 return do_strip(self, LEFTSTRIP); /* Common case */
5531 else
5532 return do_argstrip(self, LEFTSTRIP, args);
5533}
5534
5535
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005536PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005537"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005538\n\
5539Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005540If chars is given and not None, remove characters in chars instead.\n\
5541If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005542
5543static PyObject *
5544unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5545{
5546 if (PyTuple_GET_SIZE(args) == 0)
5547 return do_strip(self, RIGHTSTRIP); /* Common case */
5548 else
5549 return do_argstrip(self, RIGHTSTRIP, args);
5550}
5551
5552
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553static PyObject*
5554unicode_repeat(PyUnicodeObject *str, int len)
5555{
5556 PyUnicodeObject *u;
5557 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005558 int nchars;
5559 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560
5561 if (len < 0)
5562 len = 0;
5563
Tim Peters7a29bd52001-09-12 03:03:31 +00005564 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 /* no repeat, return original string */
5566 Py_INCREF(str);
5567 return (PyObject*) str;
5568 }
Tim Peters8f422462000-09-09 06:13:41 +00005569
5570 /* ensure # of chars needed doesn't overflow int and # of bytes
5571 * needed doesn't overflow size_t
5572 */
5573 nchars = len * str->length;
5574 if (len && nchars / len != str->length) {
5575 PyErr_SetString(PyExc_OverflowError,
5576 "repeated string is too long");
5577 return NULL;
5578 }
5579 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5580 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5581 PyErr_SetString(PyExc_OverflowError,
5582 "repeated string is too long");
5583 return NULL;
5584 }
5585 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 if (!u)
5587 return NULL;
5588
5589 p = u->str;
5590
5591 while (len-- > 0) {
5592 Py_UNICODE_COPY(p, str->str, str->length);
5593 p += str->length;
5594 }
5595
5596 return (PyObject*) u;
5597}
5598
5599PyObject *PyUnicode_Replace(PyObject *obj,
5600 PyObject *subobj,
5601 PyObject *replobj,
5602 int maxcount)
5603{
5604 PyObject *self;
5605 PyObject *str1;
5606 PyObject *str2;
5607 PyObject *result;
5608
5609 self = PyUnicode_FromObject(obj);
5610 if (self == NULL)
5611 return NULL;
5612 str1 = PyUnicode_FromObject(subobj);
5613 if (str1 == NULL) {
5614 Py_DECREF(self);
5615 return NULL;
5616 }
5617 str2 = PyUnicode_FromObject(replobj);
5618 if (str2 == NULL) {
5619 Py_DECREF(self);
5620 Py_DECREF(str1);
5621 return NULL;
5622 }
Tim Petersced69f82003-09-16 20:30:58 +00005623 result = replace((PyUnicodeObject *)self,
5624 (PyUnicodeObject *)str1,
5625 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 maxcount);
5627 Py_DECREF(self);
5628 Py_DECREF(str1);
5629 Py_DECREF(str2);
5630 return result;
5631}
5632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005633PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634"S.replace (old, new[, maxsplit]) -> unicode\n\
5635\n\
5636Return a copy of S with all occurrences of substring\n\
5637old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005638given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639
5640static PyObject*
5641unicode_replace(PyUnicodeObject *self, PyObject *args)
5642{
5643 PyUnicodeObject *str1;
5644 PyUnicodeObject *str2;
5645 int maxcount = -1;
5646 PyObject *result;
5647
5648 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5649 return NULL;
5650 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5651 if (str1 == NULL)
5652 return NULL;
5653 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005654 if (str2 == NULL) {
5655 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658
5659 result = replace(self, str1, str2, maxcount);
5660
5661 Py_DECREF(str1);
5662 Py_DECREF(str2);
5663 return result;
5664}
5665
5666static
5667PyObject *unicode_repr(PyObject *unicode)
5668{
5669 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5670 PyUnicode_GET_SIZE(unicode),
5671 1);
5672}
5673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005674PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675"S.rfind(sub [,start [,end]]) -> int\n\
5676\n\
5677Return the highest index in S where substring sub is found,\n\
5678such that sub is contained within s[start,end]. Optional\n\
5679arguments start and end are interpreted as in slice notation.\n\
5680\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005681Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682
5683static PyObject *
5684unicode_rfind(PyUnicodeObject *self, PyObject *args)
5685{
5686 PyUnicodeObject *substring;
5687 int start = 0;
5688 int end = INT_MAX;
5689 PyObject *result;
5690
Guido van Rossumb8872e62000-05-09 14:14:27 +00005691 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5692 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 return NULL;
5694 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5695 (PyObject *)substring);
5696 if (substring == NULL)
5697 return NULL;
5698
5699 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5700
5701 Py_DECREF(substring);
5702 return result;
5703}
5704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005705PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706"S.rindex(sub [,start [,end]]) -> int\n\
5707\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005708Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709
5710static PyObject *
5711unicode_rindex(PyUnicodeObject *self, PyObject *args)
5712{
5713 int result;
5714 PyUnicodeObject *substring;
5715 int start = 0;
5716 int end = INT_MAX;
5717
Guido van Rossumb8872e62000-05-09 14:14:27 +00005718 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5719 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 return NULL;
5721 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5722 (PyObject *)substring);
5723 if (substring == NULL)
5724 return NULL;
5725
5726 result = findstring(self, substring, start, end, -1);
5727
5728 Py_DECREF(substring);
5729 if (result < 0) {
5730 PyErr_SetString(PyExc_ValueError, "substring not found");
5731 return NULL;
5732 }
5733 return PyInt_FromLong(result);
5734}
5735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005736PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005737"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738\n\
5739Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005740done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741
5742static PyObject *
5743unicode_rjust(PyUnicodeObject *self, PyObject *args)
5744{
5745 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005746 Py_UNICODE fillchar = ' ';
5747
5748 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 return NULL;
5750
Tim Peters7a29bd52001-09-12 03:03:31 +00005751 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 Py_INCREF(self);
5753 return (PyObject*) self;
5754 }
5755
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005756 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757}
5758
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759static PyObject*
5760unicode_slice(PyUnicodeObject *self, int start, int end)
5761{
5762 /* standard clamping */
5763 if (start < 0)
5764 start = 0;
5765 if (end < 0)
5766 end = 0;
5767 if (end > self->length)
5768 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005769 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 /* full slice, return original string */
5771 Py_INCREF(self);
5772 return (PyObject*) self;
5773 }
5774 if (start > end)
5775 start = end;
5776 /* copy slice */
5777 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5778 end - start);
5779}
5780
5781PyObject *PyUnicode_Split(PyObject *s,
5782 PyObject *sep,
5783 int maxsplit)
5784{
5785 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005786
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 s = PyUnicode_FromObject(s);
5788 if (s == NULL)
5789 return NULL;
5790 if (sep != NULL) {
5791 sep = PyUnicode_FromObject(sep);
5792 if (sep == NULL) {
5793 Py_DECREF(s);
5794 return NULL;
5795 }
5796 }
5797
5798 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5799
5800 Py_DECREF(s);
5801 Py_XDECREF(sep);
5802 return result;
5803}
5804
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005805PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806"S.split([sep [,maxsplit]]) -> list of strings\n\
5807\n\
5808Return a list of the words in S, using sep as the\n\
5809delimiter string. If maxsplit is given, at most maxsplit\n\
5810splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005811is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812
5813static PyObject*
5814unicode_split(PyUnicodeObject *self, PyObject *args)
5815{
5816 PyObject *substring = Py_None;
5817 int maxcount = -1;
5818
5819 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5820 return NULL;
5821
5822 if (substring == Py_None)
5823 return split(self, NULL, maxcount);
5824 else if (PyUnicode_Check(substring))
5825 return split(self, (PyUnicodeObject *)substring, maxcount);
5826 else
5827 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5828}
5829
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005830PyObject *PyUnicode_RSplit(PyObject *s,
5831 PyObject *sep,
5832 int maxsplit)
5833{
5834 PyObject *result;
5835
5836 s = PyUnicode_FromObject(s);
5837 if (s == NULL)
5838 return NULL;
5839 if (sep != NULL) {
5840 sep = PyUnicode_FromObject(sep);
5841 if (sep == NULL) {
5842 Py_DECREF(s);
5843 return NULL;
5844 }
5845 }
5846
5847 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5848
5849 Py_DECREF(s);
5850 Py_XDECREF(sep);
5851 return result;
5852}
5853
5854PyDoc_STRVAR(rsplit__doc__,
5855"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
5856\n\
5857Return a list of the words in S, using sep as the\n\
5858delimiter string, starting at the end of the string and\n\
5859working to the front. If maxsplit is given, at most maxsplit\n\
5860splits are done. If sep is not specified, any whitespace string\n\
5861is a separator.");
5862
5863static PyObject*
5864unicode_rsplit(PyUnicodeObject *self, PyObject *args)
5865{
5866 PyObject *substring = Py_None;
5867 int maxcount = -1;
5868
5869 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
5870 return NULL;
5871
5872 if (substring == Py_None)
5873 return rsplit(self, NULL, maxcount);
5874 else if (PyUnicode_Check(substring))
5875 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
5876 else
5877 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
5878}
5879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005880PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005881"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882\n\
5883Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005884Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005885is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886
5887static PyObject*
5888unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5889{
Guido van Rossum86662912000-04-11 15:38:46 +00005890 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
Guido van Rossum86662912000-04-11 15:38:46 +00005892 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 return NULL;
5894
Guido van Rossum86662912000-04-11 15:38:46 +00005895 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896}
5897
5898static
5899PyObject *unicode_str(PyUnicodeObject *self)
5900{
Fred Drakee4315f52000-05-09 19:53:39 +00005901 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902}
5903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005904PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905"S.swapcase() -> unicode\n\
5906\n\
5907Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005908and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909
5910static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005911unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 return fixup(self, fixswapcase);
5914}
5915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005916PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917"S.translate(table) -> unicode\n\
5918\n\
5919Return a copy of the string S, where all characters have been mapped\n\
5920through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005921Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5922Unmapped characters are left untouched. Characters mapped to None\n\
5923are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924
5925static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005926unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927{
Tim Petersced69f82003-09-16 20:30:58 +00005928 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00005930 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 "ignore");
5932}
5933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005934PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935"S.upper() -> unicode\n\
5936\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005937Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938
5939static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005940unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 return fixup(self, fixupper);
5943}
5944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005945PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946"S.zfill(width) -> unicode\n\
5947\n\
5948Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005949of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950
5951static PyObject *
5952unicode_zfill(PyUnicodeObject *self, PyObject *args)
5953{
5954 int fill;
5955 PyUnicodeObject *u;
5956
5957 int width;
5958 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5959 return NULL;
5960
5961 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005962 if (PyUnicode_CheckExact(self)) {
5963 Py_INCREF(self);
5964 return (PyObject*) self;
5965 }
5966 else
5967 return PyUnicode_FromUnicode(
5968 PyUnicode_AS_UNICODE(self),
5969 PyUnicode_GET_SIZE(self)
5970 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 }
5972
5973 fill = width - self->length;
5974
5975 u = pad(self, fill, 0, '0');
5976
Walter Dörwald068325e2002-04-15 13:36:47 +00005977 if (u == NULL)
5978 return NULL;
5979
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 if (u->str[fill] == '+' || u->str[fill] == '-') {
5981 /* move sign to beginning of string */
5982 u->str[0] = u->str[fill];
5983 u->str[fill] = '0';
5984 }
5985
5986 return (PyObject*) u;
5987}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988
#if 0
/* Compiled out: returns unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);
    return size;
}
#endif
5996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005997PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005998"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006000Return True if S starts with the specified prefix, False otherwise.\n\
6001With optional start, test S beginning at that position.\n\
6002With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003
6004static PyObject *
6005unicode_startswith(PyUnicodeObject *self,
6006 PyObject *args)
6007{
6008 PyUnicodeObject *substring;
6009 int start = 0;
6010 int end = INT_MAX;
6011 PyObject *result;
6012
Guido van Rossumb8872e62000-05-09 14:14:27 +00006013 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6014 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 return NULL;
6016 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6017 (PyObject *)substring);
6018 if (substring == NULL)
6019 return NULL;
6020
Guido van Rossum77f6a652002-04-03 22:41:51 +00006021 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022
6023 Py_DECREF(substring);
6024 return result;
6025}
6026
6027
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006028PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006029"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006031Return True if S ends with the specified suffix, False otherwise.\n\
6032With optional start, test S beginning at that position.\n\
6033With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034
6035static PyObject *
6036unicode_endswith(PyUnicodeObject *self,
6037 PyObject *args)
6038{
6039 PyUnicodeObject *substring;
6040 int start = 0;
6041 int end = INT_MAX;
6042 PyObject *result;
6043
Guido van Rossumb8872e62000-05-09 14:14:27 +00006044 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6045 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 return NULL;
6047 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6048 (PyObject *)substring);
6049 if (substring == NULL)
6050 return NULL;
6051
Guido van Rossum77f6a652002-04-03 22:41:51 +00006052 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053
6054 Py_DECREF(substring);
6055 return result;
6056}
6057
6058
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006059
6060static PyObject *
6061unicode_getnewargs(PyUnicodeObject *v)
6062{
6063 return Py_BuildValue("(u#)", v->str, v->length);
6064}
6065
6066
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067static PyMethodDef unicode_methods[] = {
6068
6069 /* Order is according to common usage: often used methods should
6070 appear first, since lookup is done sequentially. */
6071
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006072 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6073 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6074 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006075 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006076 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6077 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6078 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6079 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6080 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6081 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6082 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6083 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6084 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6085 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006086 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006087/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6088 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6089 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6090 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006091 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006092 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006093 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006094 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6095 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6096 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6097 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6098 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6099 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6100 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6101 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6102 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6103 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6104 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6105 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6106 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6107 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006108 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006109#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006110 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111#endif
6112
6113#if 0
6114 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006115 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116#endif
6117
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006118 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 {NULL, NULL}
6120};
6121
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006122static PyObject *
6123unicode_mod(PyObject *v, PyObject *w)
6124{
6125 if (!PyUnicode_Check(v)) {
6126 Py_INCREF(Py_NotImplemented);
6127 return Py_NotImplemented;
6128 }
6129 return PyUnicode_Format(v, w);
6130}
6131
6132static PyNumberMethods unicode_as_number = {
6133 0, /*nb_add*/
6134 0, /*nb_subtract*/
6135 0, /*nb_multiply*/
6136 0, /*nb_divide*/
6137 unicode_mod, /*nb_remainder*/
6138};
6139
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140static PySequenceMethods unicode_as_sequence = {
6141 (inquiry) unicode_length, /* sq_length */
6142 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6143 (intargfunc) unicode_repeat, /* sq_repeat */
6144 (intargfunc) unicode_getitem, /* sq_item */
6145 (intintargfunc) unicode_slice, /* sq_slice */
6146 0, /* sq_ass_item */
6147 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006148 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149};
6150
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006151static PyObject*
6152unicode_subscript(PyUnicodeObject* self, PyObject* item)
6153{
6154 if (PyInt_Check(item)) {
6155 long i = PyInt_AS_LONG(item);
6156 if (i < 0)
6157 i += PyString_GET_SIZE(self);
6158 return unicode_getitem(self, i);
6159 } else if (PyLong_Check(item)) {
6160 long i = PyLong_AsLong(item);
6161 if (i == -1 && PyErr_Occurred())
6162 return NULL;
6163 if (i < 0)
6164 i += PyString_GET_SIZE(self);
6165 return unicode_getitem(self, i);
6166 } else if (PySlice_Check(item)) {
6167 int start, stop, step, slicelength, cur, i;
6168 Py_UNICODE* source_buf;
6169 Py_UNICODE* result_buf;
6170 PyObject* result;
6171
6172 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6173 &start, &stop, &step, &slicelength) < 0) {
6174 return NULL;
6175 }
6176
6177 if (slicelength <= 0) {
6178 return PyUnicode_FromUnicode(NULL, 0);
6179 } else {
6180 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6181 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6182
6183 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6184 result_buf[i] = source_buf[cur];
6185 }
Tim Petersced69f82003-09-16 20:30:58 +00006186
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006187 result = PyUnicode_FromUnicode(result_buf, slicelength);
6188 PyMem_FREE(result_buf);
6189 return result;
6190 }
6191 } else {
6192 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6193 return NULL;
6194 }
6195}
6196
6197static PyMappingMethods unicode_as_mapping = {
6198 (inquiry)unicode_length, /* mp_length */
6199 (binaryfunc)unicode_subscript, /* mp_subscript */
6200 (objobjargproc)0, /* mp_ass_subscript */
6201};
6202
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203static int
6204unicode_buffer_getreadbuf(PyUnicodeObject *self,
6205 int index,
6206 const void **ptr)
6207{
6208 if (index != 0) {
6209 PyErr_SetString(PyExc_SystemError,
6210 "accessing non-existent unicode segment");
6211 return -1;
6212 }
6213 *ptr = (void *) self->str;
6214 return PyUnicode_GET_DATA_SIZE(self);
6215}
6216
6217static int
6218unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6219 const void **ptr)
6220{
6221 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006222 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 return -1;
6224}
6225
6226static int
6227unicode_buffer_getsegcount(PyUnicodeObject *self,
6228 int *lenp)
6229{
6230 if (lenp)
6231 *lenp = PyUnicode_GET_DATA_SIZE(self);
6232 return 1;
6233}
6234
6235static int
6236unicode_buffer_getcharbuf(PyUnicodeObject *self,
6237 int index,
6238 const void **ptr)
6239{
6240 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006241
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 if (index != 0) {
6243 PyErr_SetString(PyExc_SystemError,
6244 "accessing non-existent unicode segment");
6245 return -1;
6246 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006247 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 if (str == NULL)
6249 return -1;
6250 *ptr = (void *) PyString_AS_STRING(str);
6251 return PyString_GET_SIZE(str);
6252}
6253
6254/* Helpers for PyUnicode_Format() */
6255
6256static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006257getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258{
6259 int argidx = *p_argidx;
6260 if (argidx < arglen) {
6261 (*p_argidx)++;
6262 if (arglen < 0)
6263 return args;
6264 else
6265 return PyTuple_GetItem(args, argidx);
6266 }
6267 PyErr_SetString(PyExc_TypeError,
6268 "not enough arguments for format string");
6269 return NULL;
6270}
6271
/* Bit flags for format specifiers; each occupies its own bit so they
   can be OR-ed together. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6277
6278static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280{
6281 register int i;
6282 int len;
6283 va_list va;
6284 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286
6287 /* First, format the string as char array, then expand to Py_UNICODE
6288 array. */
6289 charbuffer = (char *)buffer;
6290 len = vsprintf(charbuffer, format, va);
6291 for (i = len - 1; i >= 0; i--)
6292 buffer[i] = (Py_UNICODE) charbuffer[i];
6293
6294 va_end(va);
6295 return len;
6296}
6297
Guido van Rossum078151d2002-08-11 04:24:12 +00006298/* XXX To save some code duplication, formatfloat/long/int could have been
6299 shared with stringobject.c, converting from 8-bit to Unicode after the
6300 formatting is done. */
6301
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302static int
6303formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006304 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 int flags,
6306 int prec,
6307 int type,
6308 PyObject *v)
6309{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006310 /* fmt = '%#.' + `prec` + `type`
6311 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 char fmt[20];
6313 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006314
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 x = PyFloat_AsDouble(v);
6316 if (x == -1.0 && PyErr_Occurred())
6317 return -1;
6318 if (prec < 0)
6319 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6321 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006322 /* Worst case length calc to ensure no buffer overrun:
6323
6324 'g' formats:
6325 fmt = %#.<prec>g
6326 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6327 for any double rep.)
6328 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6329
6330 'f' formats:
6331 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6332 len = 1 + 50 + 1 + prec = 52 + prec
6333
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006334 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006335 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006336
6337 */
6338 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6339 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006340 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006341 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006342 return -1;
6343 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006344 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6345 (flags&F_ALT) ? "#" : "",
6346 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 return usprintf(buf, fmt, x);
6348}
6349
Tim Peters38fd5b62000-09-21 05:43:11 +00006350static PyObject*
6351formatlong(PyObject *val, int flags, int prec, int type)
6352{
6353 char *buf;
6354 int i, len;
6355 PyObject *str; /* temporary string object. */
6356 PyUnicodeObject *result;
6357
6358 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6359 if (!str)
6360 return NULL;
6361 result = _PyUnicode_New(len);
6362 for (i = 0; i < len; i++)
6363 result->str[i] = buf[i];
6364 result->str[len] = 0;
6365 Py_DECREF(str);
6366 return (PyObject*)result;
6367}
6368
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369static int
6370formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006371 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372 int flags,
6373 int prec,
6374 int type,
6375 PyObject *v)
6376{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006377 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006378 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6379 * + 1 + 1
6380 * = 24
6381 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006382 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006383 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 long x;
6385
6386 x = PyInt_AsLong(v);
6387 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006388 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006389 if (x < 0 && type == 'u') {
6390 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006391 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006392 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6393 sign = "-";
6394 else
6395 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006397 prec = 1;
6398
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006399 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6400 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006401 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006402 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006403 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006404 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006405 return -1;
6406 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006407
6408 if ((flags & F_ALT) &&
6409 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006410 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006411 * of issues that cause pain:
6412 * - when 0 is being converted, the C standard leaves off
6413 * the '0x' or '0X', which is inconsistent with other
6414 * %#x/%#X conversions and inconsistent with Python's
6415 * hex() function
6416 * - there are platforms that violate the standard and
6417 * convert 0 with the '0x' or '0X'
6418 * (Metrowerks, Compaq Tru64)
6419 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006420 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006421 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006422 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006423 * We can achieve the desired consistency by inserting our
6424 * own '0x' or '0X' prefix, and substituting %x/%X in place
6425 * of %#x/%#X.
6426 *
6427 * Note that this is the same approach as used in
6428 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006429 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006430 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6431 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006432 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006433 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006434 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6435 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006436 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006437 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006438 if (sign[0])
6439 return usprintf(buf, fmt, -x);
6440 else
6441 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442}
6443
6444static int
6445formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006446 size_t buflen,
6447 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006449 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006450 if (PyUnicode_Check(v)) {
6451 if (PyUnicode_GET_SIZE(v) != 1)
6452 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006456 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006457 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006458 goto onError;
6459 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6460 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461
6462 else {
6463 /* Integer input truncated to a character */
6464 long x;
6465 x = PyInt_AsLong(v);
6466 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006467 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006468#ifdef Py_UNICODE_WIDE
6469 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006470 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006471 "%c arg not in range(0x110000) "
6472 "(wide Python build)");
6473 return -1;
6474 }
6475#else
6476 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006477 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006478 "%c arg not in range(0x10000) "
6479 "(narrow Python build)");
6480 return -1;
6481 }
6482#endif
6483 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 }
6485 buf[1] = '\0';
6486 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006487
6488 onError:
6489 PyErr_SetString(PyExc_TypeError,
6490 "%c requires int or char");
6491 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492}
6493
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006494/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6495
6496 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6497 chars are formatted. XXX This is a magic number. Each formatting
6498 routine does bounds checking to ensure no overflow, but a better
6499 solution may be to malloc a buffer of appropriate size for each
6500 format. For now, the current solution is sufficient.
6501*/
6502#define FORMATBUFLEN (size_t)120
6503
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504PyObject *PyUnicode_Format(PyObject *format,
6505 PyObject *args)
6506{
6507 Py_UNICODE *fmt, *res;
6508 int fmtcnt, rescnt, reslen, arglen, argidx;
6509 int args_owned = 0;
6510 PyUnicodeObject *result = NULL;
6511 PyObject *dict = NULL;
6512 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006513
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 if (format == NULL || args == NULL) {
6515 PyErr_BadInternalCall();
6516 return NULL;
6517 }
6518 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006519 if (uformat == NULL)
6520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 fmt = PyUnicode_AS_UNICODE(uformat);
6522 fmtcnt = PyUnicode_GET_SIZE(uformat);
6523
6524 reslen = rescnt = fmtcnt + 100;
6525 result = _PyUnicode_New(reslen);
6526 if (result == NULL)
6527 goto onError;
6528 res = PyUnicode_AS_UNICODE(result);
6529
6530 if (PyTuple_Check(args)) {
6531 arglen = PyTuple_Size(args);
6532 argidx = 0;
6533 }
6534 else {
6535 arglen = -1;
6536 argidx = -2;
6537 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006538 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6539 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 dict = args;
6541
6542 while (--fmtcnt >= 0) {
6543 if (*fmt != '%') {
6544 if (--rescnt < 0) {
6545 rescnt = fmtcnt + 100;
6546 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006547 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 return NULL;
6549 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6550 --rescnt;
6551 }
6552 *res++ = *fmt++;
6553 }
6554 else {
6555 /* Got a format specifier */
6556 int flags = 0;
6557 int width = -1;
6558 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 Py_UNICODE c = '\0';
6560 Py_UNICODE fill;
6561 PyObject *v = NULL;
6562 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006563 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 Py_UNICODE sign;
6565 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006566 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567
6568 fmt++;
6569 if (*fmt == '(') {
6570 Py_UNICODE *keystart;
6571 int keylen;
6572 PyObject *key;
6573 int pcount = 1;
6574
6575 if (dict == NULL) {
6576 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006577 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 goto onError;
6579 }
6580 ++fmt;
6581 --fmtcnt;
6582 keystart = fmt;
6583 /* Skip over balanced parentheses */
6584 while (pcount > 0 && --fmtcnt >= 0) {
6585 if (*fmt == ')')
6586 --pcount;
6587 else if (*fmt == '(')
6588 ++pcount;
6589 fmt++;
6590 }
6591 keylen = fmt - keystart - 1;
6592 if (fmtcnt < 0 || pcount > 0) {
6593 PyErr_SetString(PyExc_ValueError,
6594 "incomplete format key");
6595 goto onError;
6596 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006597#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006598 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 then looked up since Python uses strings to hold
6600 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006601 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 key = PyUnicode_EncodeUTF8(keystart,
6603 keylen,
6604 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006605#else
6606 key = PyUnicode_FromUnicode(keystart, keylen);
6607#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 if (key == NULL)
6609 goto onError;
6610 if (args_owned) {
6611 Py_DECREF(args);
6612 args_owned = 0;
6613 }
6614 args = PyObject_GetItem(dict, key);
6615 Py_DECREF(key);
6616 if (args == NULL) {
6617 goto onError;
6618 }
6619 args_owned = 1;
6620 arglen = -1;
6621 argidx = -2;
6622 }
6623 while (--fmtcnt >= 0) {
6624 switch (c = *fmt++) {
6625 case '-': flags |= F_LJUST; continue;
6626 case '+': flags |= F_SIGN; continue;
6627 case ' ': flags |= F_BLANK; continue;
6628 case '#': flags |= F_ALT; continue;
6629 case '0': flags |= F_ZERO; continue;
6630 }
6631 break;
6632 }
6633 if (c == '*') {
6634 v = getnextarg(args, arglen, &argidx);
6635 if (v == NULL)
6636 goto onError;
6637 if (!PyInt_Check(v)) {
6638 PyErr_SetString(PyExc_TypeError,
6639 "* wants int");
6640 goto onError;
6641 }
6642 width = PyInt_AsLong(v);
6643 if (width < 0) {
6644 flags |= F_LJUST;
6645 width = -width;
6646 }
6647 if (--fmtcnt >= 0)
6648 c = *fmt++;
6649 }
6650 else if (c >= '0' && c <= '9') {
6651 width = c - '0';
6652 while (--fmtcnt >= 0) {
6653 c = *fmt++;
6654 if (c < '0' || c > '9')
6655 break;
6656 if ((width*10) / 10 != width) {
6657 PyErr_SetString(PyExc_ValueError,
6658 "width too big");
6659 goto onError;
6660 }
6661 width = width*10 + (c - '0');
6662 }
6663 }
6664 if (c == '.') {
6665 prec = 0;
6666 if (--fmtcnt >= 0)
6667 c = *fmt++;
6668 if (c == '*') {
6669 v = getnextarg(args, arglen, &argidx);
6670 if (v == NULL)
6671 goto onError;
6672 if (!PyInt_Check(v)) {
6673 PyErr_SetString(PyExc_TypeError,
6674 "* wants int");
6675 goto onError;
6676 }
6677 prec = PyInt_AsLong(v);
6678 if (prec < 0)
6679 prec = 0;
6680 if (--fmtcnt >= 0)
6681 c = *fmt++;
6682 }
6683 else if (c >= '0' && c <= '9') {
6684 prec = c - '0';
6685 while (--fmtcnt >= 0) {
6686 c = Py_CHARMASK(*fmt++);
6687 if (c < '0' || c > '9')
6688 break;
6689 if ((prec*10) / 10 != prec) {
6690 PyErr_SetString(PyExc_ValueError,
6691 "prec too big");
6692 goto onError;
6693 }
6694 prec = prec*10 + (c - '0');
6695 }
6696 }
6697 } /* prec */
6698 if (fmtcnt >= 0) {
6699 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 if (--fmtcnt >= 0)
6701 c = *fmt++;
6702 }
6703 }
6704 if (fmtcnt < 0) {
6705 PyErr_SetString(PyExc_ValueError,
6706 "incomplete format");
6707 goto onError;
6708 }
6709 if (c != '%') {
6710 v = getnextarg(args, arglen, &argidx);
6711 if (v == NULL)
6712 goto onError;
6713 }
6714 sign = 0;
6715 fill = ' ';
6716 switch (c) {
6717
6718 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006719 pbuf = formatbuf;
6720 /* presume that buffer length is at least 1 */
6721 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 len = 1;
6723 break;
6724
6725 case 's':
6726 case 'r':
6727 if (PyUnicode_Check(v) && c == 's') {
6728 temp = v;
6729 Py_INCREF(temp);
6730 }
6731 else {
6732 PyObject *unicode;
6733 if (c == 's')
6734 temp = PyObject_Str(v);
6735 else
6736 temp = PyObject_Repr(v);
6737 if (temp == NULL)
6738 goto onError;
6739 if (!PyString_Check(temp)) {
6740 /* XXX Note: this should never happen, since
6741 PyObject_Repr() and PyObject_Str() assure
6742 this */
6743 Py_DECREF(temp);
6744 PyErr_SetString(PyExc_TypeError,
6745 "%s argument has non-string str()");
6746 goto onError;
6747 }
Fred Drakee4315f52000-05-09 19:53:39 +00006748 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006750 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 "strict");
6752 Py_DECREF(temp);
6753 temp = unicode;
6754 if (temp == NULL)
6755 goto onError;
6756 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006757 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 len = PyUnicode_GET_SIZE(temp);
6759 if (prec >= 0 && len > prec)
6760 len = prec;
6761 break;
6762
6763 case 'i':
6764 case 'd':
6765 case 'u':
6766 case 'o':
6767 case 'x':
6768 case 'X':
6769 if (c == 'i')
6770 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006771 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006772 temp = formatlong(v, flags, prec, c);
6773 if (!temp)
6774 goto onError;
6775 pbuf = PyUnicode_AS_UNICODE(temp);
6776 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00006777 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006779 else {
6780 pbuf = formatbuf;
6781 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6782 flags, prec, c, v);
6783 if (len < 0)
6784 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006785 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006786 }
6787 if (flags & F_ZERO)
6788 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 break;
6790
6791 case 'e':
6792 case 'E':
6793 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006794 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 case 'g':
6796 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006797 if (c == 'F')
6798 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006799 pbuf = formatbuf;
6800 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6801 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 if (len < 0)
6803 goto onError;
6804 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006805 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 fill = '0';
6807 break;
6808
6809 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006810 pbuf = formatbuf;
6811 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 if (len < 0)
6813 goto onError;
6814 break;
6815
6816 default:
6817 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006818 "unsupported format character '%c' (0x%x) "
6819 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006820 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006821 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006822 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 goto onError;
6824 }
6825 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006826 if (*pbuf == '-' || *pbuf == '+') {
6827 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 len--;
6829 }
6830 else if (flags & F_SIGN)
6831 sign = '+';
6832 else if (flags & F_BLANK)
6833 sign = ' ';
6834 else
6835 sign = 0;
6836 }
6837 if (width < len)
6838 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006839 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 reslen -= rescnt;
6841 rescnt = width + fmtcnt + 100;
6842 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006843 if (reslen < 0) {
6844 Py_DECREF(result);
6845 return PyErr_NoMemory();
6846 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006847 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 return NULL;
6849 res = PyUnicode_AS_UNICODE(result)
6850 + reslen - rescnt;
6851 }
6852 if (sign) {
6853 if (fill != ' ')
6854 *res++ = sign;
6855 rescnt--;
6856 if (width > len)
6857 width--;
6858 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006859 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6860 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006861 assert(pbuf[1] == c);
6862 if (fill != ' ') {
6863 *res++ = *pbuf++;
6864 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006865 }
Tim Petersfff53252001-04-12 18:38:48 +00006866 rescnt -= 2;
6867 width -= 2;
6868 if (width < 0)
6869 width = 0;
6870 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006871 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 if (width > len && !(flags & F_LJUST)) {
6873 do {
6874 --rescnt;
6875 *res++ = fill;
6876 } while (--width > len);
6877 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006878 if (fill == ' ') {
6879 if (sign)
6880 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006881 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006882 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006883 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006884 *res++ = *pbuf++;
6885 *res++ = *pbuf++;
6886 }
6887 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006888 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 res += len;
6890 rescnt -= len;
6891 while (--width >= len) {
6892 --rescnt;
6893 *res++ = ' ';
6894 }
6895 if (dict && (argidx < arglen) && c != '%') {
6896 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006897 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 goto onError;
6899 }
6900 Py_XDECREF(temp);
6901 } /* '%' */
6902 } /* until end */
6903 if (argidx < arglen && !dict) {
6904 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006905 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 goto onError;
6907 }
6908
6909 if (args_owned) {
6910 Py_DECREF(args);
6911 }
6912 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00006913 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006914 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 return (PyObject *)result;
6916
6917 onError:
6918 Py_XDECREF(result);
6919 Py_DECREF(uformat);
6920 if (args_owned) {
6921 Py_DECREF(args);
6922 }
6923 return NULL;
6924}
6925
/* Buffer-procedure table for unicode objects.  PyUnicode_Type points at it
   through its tp_as_buffer slot.  The entries are positional: read buffer,
   write buffer, segment count, character buffer (as named by the casts). */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
6932
Jeremy Hylton938ace62002-07-17 16:30:39 +00006933static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006934unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6935
Tim Peters6d6c1a32001-08-02 04:15:00 +00006936static PyObject *
6937unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6938{
6939 PyObject *x = NULL;
6940 static char *kwlist[] = {"string", "encoding", "errors", 0};
6941 char *encoding = NULL;
6942 char *errors = NULL;
6943
Guido van Rossume023fe02001-08-30 03:12:59 +00006944 if (type != &PyUnicode_Type)
6945 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006946 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6947 kwlist, &x, &encoding, &errors))
6948 return NULL;
6949 if (x == NULL)
6950 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006951 if (encoding == NULL && errors == NULL)
6952 return PyObject_Unicode(x);
6953 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006954 return PyUnicode_FromEncodedObject(x, encoding, errors);
6955}
6956
Guido van Rossume023fe02001-08-30 03:12:59 +00006957static PyObject *
6958unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6959{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006960 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006961 int n;
6962
6963 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6964 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6965 if (tmp == NULL)
6966 return NULL;
6967 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006968 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006969 if (pnew == NULL) {
6970 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00006971 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00006972 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006973 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6974 if (pnew->str == NULL) {
6975 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006976 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006977 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00006978 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00006979 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006980 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6981 pnew->length = n;
6982 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006983 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006984 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006985}
6986
/* Docstring for the unicode type; installed as tp_doc in PyUnicode_Type.
   The continuation lines must start in column 0 because they are part of
   the string literal. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006993
/* Type object for the built-in "unicode" type.

   The initializer is positional, so the order of the entries must not be
   changed.  The type derives from PyBaseString_Type, wires in the number,
   sequence, mapping and buffer slot tables, and creates instances through
   unicode_new (which forwards subtypes to unicode_subtype_new). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE,                /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
7038
7039/* Initialize the Unicode implementation */
7040
Thomas Wouters78890102000-07-22 19:25:51 +00007041void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007043 int i;
7044
Fred Drakee4315f52000-05-09 19:53:39 +00007045 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007046 unicode_freelist = NULL;
7047 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007049 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007050 for (i = 0; i < 256; i++)
7051 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007052 if (PyType_Ready(&PyUnicode_Type) < 0)
7053 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054}
7055
7056/* Finalize the Unicode implementation */
7057
7058void
Thomas Wouters78890102000-07-22 19:25:51 +00007059_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007061 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007062 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007064 Py_XDECREF(unicode_empty);
7065 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007066
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007067 for (i = 0; i < 256; i++) {
7068 if (unicode_latin1[i]) {
7069 Py_DECREF(unicode_latin1[i]);
7070 unicode_latin1[i] = NULL;
7071 }
7072 }
7073
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007074 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 PyUnicodeObject *v = u;
7076 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007077 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007078 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007079 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007080 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007082 unicode_freelist = NULL;
7083 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007085
7086/*
7087Local variables:
7088c-basic-offset: 4
7089indent-tabs-mode: nil
7090End:
7091*/