blob: 19d0353e8a8cdbf5d49dd3cc76368d2ebec090c4 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of deallocated Unicode objects that are
   parked on the free list for later reuse. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Size threshold for the free list "keep alive" optimization.

   An object that goes onto the free list keeps its character buffer
   as long as its length is below this limit; longer buffers are
   released right away.  Reusing the small buffers saves malloc()
   calls for short Unicode strings.

   The price is up to MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of memory that sit around unused.

   A limit of 0 disables the optimization.

   Note: this feature is experimental.  If Unicode objects cause core
   dumps, switch it off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection; little endian is assumed unless the build
   defines WORDS_BIGENDIAN */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
/* Wrapper around PyUnicode_Resize() that casts its first argument to
   PyObject **.  Only meant for use inside unicodeobject.c ! */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), length)
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
371 for (i = size; i >= 0; i--)
372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
387 if (size > PyUnicode_GET_SIZE(unicode))
388 size = PyUnicode_GET_SIZE(unicode);
389#ifdef HAVE_USABLE_WCHAR_T
390 memcpy(w, unicode->str, size * sizeof(wchar_t));
391#else
392 {
393 register Py_UNICODE *u;
394 register int i;
395 u = PyUnicode_AS_UNICODE(unicode);
396 for (i = size; i >= 0; i--)
397 *w++ = *u++;
398 }
399#endif
400
401 return size;
402}
403
404#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
408 Py_UNICODE s[2];
409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
426 if (ordinal <= 0xffff) {
427 /* UCS-2 character */
428 s[0] = (Py_UNICODE) ordinal;
429 return PyUnicode_FromUnicode(s, 1);
430 }
431 else {
432#ifndef Py_UNICODE_WIDE
433 /* UCS-4 character. store as two surrogate characters */
434 ordinal -= 0x10000L;
435 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
436 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
437 return PyUnicode_FromUnicode(s, 2);
438#else
439 s[0] = (Py_UNICODE)ordinal;
440 return PyUnicode_FromUnicode(s, 1);
441#endif
442 }
443}
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445PyObject *PyUnicode_FromObject(register PyObject *obj)
446{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000447 /* XXX Perhaps we should make this API an alias of
448 PyObject_Unicode() instead ?! */
449 if (PyUnicode_CheckExact(obj)) {
450 Py_INCREF(obj);
451 return obj;
452 }
453 if (PyUnicode_Check(obj)) {
454 /* For a Unicode subtype that's not a Unicode object,
455 return a true Unicode object with the same data. */
456 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
457 PyUnicode_GET_SIZE(obj));
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
460}
461
462PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
463 const char *encoding,
464 const char *errors)
465{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000466 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000468 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000469
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 if (obj == NULL) {
471 PyErr_BadInternalCall();
472 return NULL;
473 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000474
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000475#if 0
476 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000477 that no encodings is given and then redirect to
478 PyObject_Unicode() which then applies the additional logic for
479 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481 NOTE: This API should really only be used for object which
482 represent *encoded* Unicode !
483
484 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000485 if (PyUnicode_Check(obj)) {
486 if (encoding) {
487 PyErr_SetString(PyExc_TypeError,
488 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000489 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000492 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000493#else
494 if (PyUnicode_Check(obj)) {
495 PyErr_SetString(PyExc_TypeError,
496 "decoding Unicode is not supported");
497 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000498 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499#endif
500
501 /* Coerce object */
502 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000503 s = PyString_AS_STRING(obj);
504 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000505 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000506 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
507 /* Overwrite the error message with something more useful in
508 case of a TypeError. */
509 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000511 "coercing to Unicode: need string or buffer, "
512 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000513 obj->ob_type->tp_name);
514 goto onError;
515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000518 if (len == 0) {
519 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521 }
Tim Petersced69f82003-09-16 20:30:58 +0000522 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000523 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000524
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000525 return v;
526
527 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529}
530
531PyObject *PyUnicode_Decode(const char *s,
532 int size,
533 const char *encoding,
534 const char *errors)
535{
536 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000537
538 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000539 encoding = PyUnicode_GetDefaultEncoding();
540
541 /* Shortcuts for common default encodings */
542 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000544 else if (strcmp(encoding, "latin-1") == 0)
545 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000546#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
547 else if (strcmp(encoding, "mbcs") == 0)
548 return PyUnicode_DecodeMBCS(s, size, errors);
549#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000550 else if (strcmp(encoding, "ascii") == 0)
551 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552
553 /* Decode via the codec registry */
554 buffer = PyBuffer_FromMemory((void *)s, size);
555 if (buffer == NULL)
556 goto onError;
557 unicode = PyCodec_Decode(buffer, encoding, errors);
558 if (unicode == NULL)
559 goto onError;
560 if (!PyUnicode_Check(unicode)) {
561 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000562 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000563 unicode->ob_type->tp_name);
564 Py_DECREF(unicode);
565 goto onError;
566 }
567 Py_DECREF(buffer);
568 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000569
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 onError:
571 Py_XDECREF(buffer);
572 return NULL;
573}
574
575PyObject *PyUnicode_Encode(const Py_UNICODE *s,
576 int size,
577 const char *encoding,
578 const char *errors)
579{
580 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000581
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 unicode = PyUnicode_FromUnicode(s, size);
583 if (unicode == NULL)
584 return NULL;
585 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
586 Py_DECREF(unicode);
587 return v;
588}
589
590PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
591 const char *encoding,
592 const char *errors)
593{
594 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596 if (!PyUnicode_Check(unicode)) {
597 PyErr_BadArgument();
598 goto onError;
599 }
Fred Drakee4315f52000-05-09 19:53:39 +0000600
Tim Petersced69f82003-09-16 20:30:58 +0000601 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000602 encoding = PyUnicode_GetDefaultEncoding();
603
604 /* Shortcuts for common default encodings */
605 if (errors == NULL) {
606 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000607 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000608 else if (strcmp(encoding, "latin-1") == 0)
609 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000610#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
611 else if (strcmp(encoding, "mbcs") == 0)
612 return PyUnicode_AsMBCSString(unicode);
613#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000614 else if (strcmp(encoding, "ascii") == 0)
615 return PyUnicode_AsASCIIString(unicode);
616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617
618 /* Encode via the codec registry */
619 v = PyCodec_Encode(unicode, encoding, errors);
620 if (v == NULL)
621 goto onError;
622 /* XXX Should we really enforce this ? */
623 if (!PyString_Check(v)) {
624 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000625 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 v->ob_type->tp_name);
627 Py_DECREF(v);
628 goto onError;
629 }
630 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000631
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632 onError:
633 return NULL;
634}
635
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000636PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
637 const char *errors)
638{
639 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
640
641 if (v)
642 return v;
643 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
644 if (v && errors == NULL)
645 ((PyUnicodeObject *)unicode)->defenc = v;
646 return v;
647}
648
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
650{
651 if (!PyUnicode_Check(unicode)) {
652 PyErr_BadArgument();
653 goto onError;
654 }
655 return PyUnicode_AS_UNICODE(unicode);
656
657 onError:
658 return NULL;
659}
660
661int PyUnicode_GetSize(PyObject *unicode)
662{
663 if (!PyUnicode_Check(unicode)) {
664 PyErr_BadArgument();
665 goto onError;
666 }
667 return PyUnicode_GET_SIZE(unicode);
668
669 onError:
670 return -1;
671}
672
Thomas Wouters78890102000-07-22 19:25:51 +0000673const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000674{
675 return unicode_default_encoding;
676}
677
678int PyUnicode_SetDefaultEncoding(const char *encoding)
679{
680 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000681
Fred Drakee4315f52000-05-09 19:53:39 +0000682 /* Make sure the encoding is valid. As side effect, this also
683 loads the encoding into the codec registry cache. */
684 v = _PyCodec_Lookup(encoding);
685 if (v == NULL)
686 goto onError;
687 Py_DECREF(v);
688 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000689 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000690 sizeof(unicode_default_encoding));
691 return 0;
692
693 onError:
694 return -1;
695}
696
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000697/* error handling callback helper:
698 build arguments, call the callback and check the arguments,
699 if no exception occured, copy the replacement to the output
700 and adjust various state variables.
701 return 0 on success, -1 on error
702*/
703
704static
705int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
706 const char *encoding, const char *reason,
707 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
708 PyObject **output, int *outpos, Py_UNICODE **outptr)
709{
710 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
711
712 PyObject *restuple = NULL;
713 PyObject *repunicode = NULL;
714 int outsize = PyUnicode_GET_SIZE(*output);
715 int requiredsize;
716 int newpos;
717 Py_UNICODE *repptr;
718 int repsize;
719 int res = -1;
720
721 if (*errorHandler == NULL) {
722 *errorHandler = PyCodec_LookupError(errors);
723 if (*errorHandler == NULL)
724 goto onError;
725 }
726
727 if (*exceptionObject == NULL) {
728 *exceptionObject = PyUnicodeDecodeError_Create(
729 encoding, input, insize, *startinpos, *endinpos, reason);
730 if (*exceptionObject == NULL)
731 goto onError;
732 }
733 else {
734 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
735 goto onError;
736 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
737 goto onError;
738 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
739 goto onError;
740 }
741
742 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
743 if (restuple == NULL)
744 goto onError;
745 if (!PyTuple_Check(restuple)) {
746 PyErr_Format(PyExc_TypeError, &argparse[4]);
747 goto onError;
748 }
749 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
750 goto onError;
751 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000752 newpos = insize+newpos;
753 if (newpos<0 || newpos>insize) {
754 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
755 goto onError;
756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000757
758 /* need more space? (at least enough for what we
759 have+the replacement+the rest of the string (starting
760 at the new input position), so we won't have to check space
761 when there are no errors in the rest of the string) */
762 repptr = PyUnicode_AS_UNICODE(repunicode);
763 repsize = PyUnicode_GET_SIZE(repunicode);
764 requiredsize = *outpos + repsize + insize-newpos;
765 if (requiredsize > outsize) {
766 if (requiredsize<2*outsize)
767 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000768 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000769 goto onError;
770 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
771 }
772 *endinpos = newpos;
773 *inptr = input + newpos;
774 Py_UNICODE_COPY(*outptr, repptr, repsize);
775 *outptr += repsize;
776 *outpos += repsize;
777 /* we made it! */
778 res = 0;
779
780 onError:
781 Py_XDECREF(restuple);
782 return res;
783}
784
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */
788
/* Classification of the 128 ASCII characters for UTF-7 encoding.
   Each entry tells whether the character is special, i.e. cannot be
   encoded directly:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   The table is indexed by character ordinal, 16 entries per row. */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
807
/* True if character c has to be base64-encoded: it is outside ASCII or
   classified 1 in utf7_special, or it is whitespace (2) / Set O (3) and the
   corresponding encode flag is set.  The c > 127 test comes first so that
   utf7_special is only ever indexed with an ASCII value. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 digit. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  isalnum() is only defined for values that
   fit in an unsigned char (or EOF) and is locale dependent above 127, so
   anything outside ASCII is rejected before calling it. */
#define B64CHAR(c) \
    ((unsigned long)(c) < 128 && \
     (isalnum((int)(c)) || (c) == '+' || (c) == '/'))

/* Map a base64 digit back to its 6-bit value; c must satisfy B64CHAR. */
#define UB64(c)  ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                  (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 digits to out for as long as at least 6 of the `bits`
   pending low-order bits of ch are available; updates out and bits. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit 16-bit characters to out for as long as at least 16 of the `bits`
   pending low-order bits of ch are available; updates out, bits and
   surrogate.  Relies on a variable `errmsg` and a label `utf7Error` being
   available at the point of use. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high */ \
                /* surrogate so let's not bother seeing if the low */ \
                /* surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't */ \
                /* represent it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
842
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000843PyObject *PyUnicode_DecodeUTF7(const char *s,
844 int size,
845 const char *errors)
846{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000847 const char *starts = s;
848 int startinpos;
849 int endinpos;
850 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000851 const char *e;
852 PyUnicodeObject *unicode;
853 Py_UNICODE *p;
854 const char *errmsg = "";
855 int inShift = 0;
856 unsigned int bitsleft = 0;
857 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000858 int surrogate = 0;
859 PyObject *errorHandler = NULL;
860 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000861
862 unicode = _PyUnicode_New(size);
863 if (!unicode)
864 return NULL;
865 if (size == 0)
866 return (PyObject *)unicode;
867
868 p = unicode->str;
869 e = s + size;
870
871 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000872 Py_UNICODE ch;
873 restart:
874 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000875
876 if (inShift) {
877 if ((ch == '-') || !B64CHAR(ch)) {
878 inShift = 0;
879 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000880
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000881 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
882 if (bitsleft >= 6) {
883 /* The shift sequence has a partial character in it. If
884 bitsleft < 6 then we could just classify it as padding
885 but that is not the case here */
886
887 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000888 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889 }
890 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000891 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000892 here so indicate the potential of a misencoded character. */
893
894 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
895 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
896 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000897 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000898 }
899
900 if (ch == '-') {
901 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000902 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000903 inShift = 1;
904 }
905 } else if (SPECIAL(ch,0,0)) {
906 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000907 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908 } else {
909 *p++ = ch;
910 }
911 } else {
912 charsleft = (charsleft << 6) | UB64(ch);
913 bitsleft += 6;
914 s++;
915 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
916 }
917 }
918 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000919 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000920 s++;
921 if (s < e && *s == '-') {
922 s++;
923 *p++ = '+';
924 } else
925 {
926 inShift = 1;
927 bitsleft = 0;
928 }
929 }
930 else if (SPECIAL(ch,0,0)) {
931 errmsg = "unexpected special character";
932 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000933 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000934 }
935 else {
936 *p++ = ch;
937 s++;
938 }
939 continue;
940 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000941 outpos = p-PyUnicode_AS_UNICODE(unicode);
942 endinpos = s-starts;
943 if (unicode_decode_call_errorhandler(
944 errors, &errorHandler,
945 "utf7", errmsg,
946 starts, size, &startinpos, &endinpos, &exc, &s,
947 (PyObject **)&unicode, &outpos, &p))
948 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 }
950
951 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000952 outpos = p-PyUnicode_AS_UNICODE(unicode);
953 endinpos = size;
954 if (unicode_decode_call_errorhandler(
955 errors, &errorHandler,
956 "utf7", "unterminated shift sequence",
957 starts, size, &startinpos, &endinpos, &exc, &s,
958 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000959 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000960 if (s < e)
961 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000962 }
963
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000964 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000965 goto onError;
966
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000967 Py_XDECREF(errorHandler);
968 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000969 return (PyObject *)unicode;
970
971onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000972 Py_XDECREF(errorHandler);
973 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974 Py_DECREF(unicode);
975 return NULL;
976}
977
978
979PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
980 int size,
981 int encodeSetO,
982 int encodeWhiteSpace,
983 const char *errors)
984{
985 PyObject *v;
986 /* It might be possible to tighten this worst case */
987 unsigned int cbAllocated = 5 * size;
988 int inShift = 0;
989 int i = 0;
990 unsigned int bitsleft = 0;
991 unsigned long charsleft = 0;
992 char * out;
993 char * start;
994
995 if (size == 0)
996 return PyString_FromStringAndSize(NULL, 0);
997
998 v = PyString_FromStringAndSize(NULL, cbAllocated);
999 if (v == NULL)
1000 return NULL;
1001
1002 start = out = PyString_AS_STRING(v);
1003 for (;i < size; ++i) {
1004 Py_UNICODE ch = s[i];
1005
1006 if (!inShift) {
1007 if (ch == '+') {
1008 *out++ = '+';
1009 *out++ = '-';
1010 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1011 charsleft = ch;
1012 bitsleft = 16;
1013 *out++ = '+';
1014 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1015 inShift = bitsleft > 0;
1016 } else {
1017 *out++ = (char) ch;
1018 }
1019 } else {
1020 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1021 *out++ = B64(charsleft << (6-bitsleft));
1022 charsleft = 0;
1023 bitsleft = 0;
1024 /* Characters not in the BASE64 set implicitly unshift the sequence
1025 so no '-' is required, except if the character is itself a '-' */
1026 if (B64CHAR(ch) || ch == '-') {
1027 *out++ = '-';
1028 }
1029 inShift = 0;
1030 *out++ = (char) ch;
1031 } else {
1032 bitsleft += 16;
1033 charsleft = (charsleft << 16) | ch;
1034 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1035
1036 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001037 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 or '-' then the shift sequence will be terminated implicitly and we
1039 don't have to insert a '-'. */
1040
1041 if (bitsleft == 0) {
1042 if (i + 1 < size) {
1043 Py_UNICODE ch2 = s[i+1];
1044
1045 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001046
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001047 } else if (B64CHAR(ch2) || ch2 == '-') {
1048 *out++ = '-';
1049 inShift = 0;
1050 } else {
1051 inShift = 0;
1052 }
1053
1054 }
1055 else {
1056 *out++ = '-';
1057 inShift = 0;
1058 }
1059 }
Tim Petersced69f82003-09-16 20:30:58 +00001060 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062 }
1063 if (bitsleft) {
1064 *out++= B64(charsleft << (6-bitsleft) );
1065 *out++ = '-';
1066 }
1067
Tim Peters5de98422002-04-27 18:44:32 +00001068 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001069 return v;
1070}
1071
1072#undef SPECIAL
1073#undef B64
1074#undef B64CHAR
1075#undef UB64
1076#undef ENCODE
1077#undef DECODE
1078
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079/* --- UTF-8 Codec -------------------------------------------------------- */
1080
/* Lookup table indexed by the first byte of a UTF-8 sequence, giving the
   total length of that sequence in bytes.  An entry of zero marks a byte
   that is not a legal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1102
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103PyObject *PyUnicode_DecodeUTF8(const char *s,
1104 int size,
1105 const char *errors)
1106{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001107 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001109 int startinpos;
1110 int endinpos;
1111 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 const char *e;
1113 PyUnicodeObject *unicode;
1114 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001115 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001116 PyObject *errorHandler = NULL;
1117 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118
1119 /* Note: size will always be longer than the resulting Unicode
1120 character count */
1121 unicode = _PyUnicode_New(size);
1122 if (!unicode)
1123 return NULL;
1124 if (size == 0)
1125 return (PyObject *)unicode;
1126
1127 /* Unpack UTF-8 encoded data */
1128 p = unicode->str;
1129 e = s + size;
1130
1131 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001132 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133
1134 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001135 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136 s++;
1137 continue;
1138 }
1139
1140 n = utf8_code_length[ch];
1141
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001142 if (s + n > e) {
1143 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001144 startinpos = s-starts;
1145 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001146 goto utf8Error;
1147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148
1149 switch (n) {
1150
1151 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001152 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001153 startinpos = s-starts;
1154 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001155 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156
1157 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001158 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001159 startinpos = s-starts;
1160 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001161 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162
1163 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001164 if ((s[1] & 0xc0) != 0x80) {
1165 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001166 startinpos = s-starts;
1167 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001168 goto utf8Error;
1169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001172 startinpos = s-starts;
1173 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001174 errmsg = "illegal encoding";
1175 goto utf8Error;
1176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 break;
1180
1181 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001182 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001183 (s[2] & 0xc0) != 0x80) {
1184 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001185 startinpos = s-starts;
1186 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001187 goto utf8Error;
1188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001190 if (ch < 0x0800) {
1191 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001192 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001193
1194 XXX For wide builds (UCS-4) we should probably try
1195 to recombine the surrogates into a single code
1196 unit.
1197 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001198 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001199 startinpos = s-starts;
1200 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001201 goto utf8Error;
1202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001204 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001205 break;
1206
1207 case 4:
1208 if ((s[1] & 0xc0) != 0x80 ||
1209 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001210 (s[3] & 0xc0) != 0x80) {
1211 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001212 startinpos = s-starts;
1213 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001214 goto utf8Error;
1215 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001216 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1217 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1218 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001219 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001220 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001221 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001222 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001223 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001225 startinpos = s-starts;
1226 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 goto utf8Error;
1228 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001229#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001230 *p++ = (Py_UNICODE)ch;
1231#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001232 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001233
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001234 /* translate from 10000..10FFFF to 0..FFFF */
1235 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001236
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001237 /* high surrogate = top 10 bits added to D800 */
1238 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001239
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001240 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001241 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001242#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 break;
1244
1245 default:
1246 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001247 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001248 startinpos = s-starts;
1249 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001250 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 }
1252 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001253 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001255 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001256 outpos = p-PyUnicode_AS_UNICODE(unicode);
1257 if (unicode_decode_call_errorhandler(
1258 errors, &errorHandler,
1259 "utf8", errmsg,
1260 starts, size, &startinpos, &endinpos, &exc, &s,
1261 (PyObject **)&unicode, &outpos, &p))
1262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 }
1264
1265 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001266 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267 goto onError;
1268
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001269 Py_XDECREF(errorHandler);
1270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271 return (PyObject *)unicode;
1272
1273onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274 Py_XDECREF(errorHandler);
1275 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 Py_DECREF(unicode);
1277 return NULL;
1278}
1279
Tim Peters602f7402002-04-27 18:03:26 +00001280/* Allocation strategy: if the string is short, convert into a stack buffer
1281 and allocate exactly as much space needed at the end. Else allocate the
1282 maximum possible needed (4 result bytes per Unicode character), and return
1283 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001284*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001285PyObject *
1286PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1287 int size,
1288 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289{
Tim Peters602f7402002-04-27 18:03:26 +00001290#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001291
Tim Peters602f7402002-04-27 18:03:26 +00001292 int i; /* index into s of next input byte */
1293 PyObject *v; /* result string object */
1294 char *p; /* next free byte in output buffer */
1295 int nallocated; /* number of result bytes allocated */
1296 int nneeded; /* number of result bytes needed */
1297 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001298
Tim Peters602f7402002-04-27 18:03:26 +00001299 assert(s != NULL);
1300 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301
Tim Peters602f7402002-04-27 18:03:26 +00001302 if (size <= MAX_SHORT_UNICHARS) {
1303 /* Write into the stack buffer; nallocated can't overflow.
1304 * At the end, we'll allocate exactly as much heap space as it
1305 * turns out we need.
1306 */
1307 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1308 v = NULL; /* will allocate after we're done */
1309 p = stackbuf;
1310 }
1311 else {
1312 /* Overallocate on the heap, and give the excess back at the end. */
1313 nallocated = size * 4;
1314 if (nallocated / 4 != size) /* overflow! */
1315 return PyErr_NoMemory();
1316 v = PyString_FromStringAndSize(NULL, nallocated);
1317 if (v == NULL)
1318 return NULL;
1319 p = PyString_AS_STRING(v);
1320 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001321
Tim Peters602f7402002-04-27 18:03:26 +00001322 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001323 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001324
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001325 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001326 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001328
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001330 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001331 *p++ = (char)(0xc0 | (ch >> 6));
1332 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001333 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001334 else {
Tim Peters602f7402002-04-27 18:03:26 +00001335 /* Encode UCS2 Unicode ordinals */
1336 if (ch < 0x10000) {
1337 /* Special case: check for high surrogate */
1338 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1339 Py_UCS4 ch2 = s[i];
1340 /* Check for low surrogate and combine the two to
1341 form a UCS4 value */
1342 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001343 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001344 i++;
1345 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 }
Tim Peters602f7402002-04-27 18:03:26 +00001347 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001349 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001350 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1351 *p++ = (char)(0x80 | (ch & 0x3f));
1352 continue;
1353 }
1354encodeUCS4:
1355 /* Encode UCS4 Unicode ordinals */
1356 *p++ = (char)(0xf0 | (ch >> 18));
1357 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1358 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1359 *p++ = (char)(0x80 | (ch & 0x3f));
1360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001362
Tim Peters602f7402002-04-27 18:03:26 +00001363 if (v == NULL) {
1364 /* This was stack allocated. */
1365 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1366 assert(nneeded <= nallocated);
1367 v = PyString_FromStringAndSize(stackbuf, nneeded);
1368 }
1369 else {
1370 /* Cut back to size actually needed. */
1371 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1372 assert(nneeded <= nallocated);
1373 _PyString_Resize(&v, nneeded);
1374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001376
Tim Peters602f7402002-04-27 18:03:26 +00001377#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378}
1379
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1381{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 if (!PyUnicode_Check(unicode)) {
1383 PyErr_BadArgument();
1384 return NULL;
1385 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001386 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1387 PyUnicode_GET_SIZE(unicode),
1388 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389}
1390
1391/* --- UTF-16 Codec ------------------------------------------------------- */
1392
Tim Peters772747b2001-08-09 22:21:55 +00001393PyObject *
1394PyUnicode_DecodeUTF16(const char *s,
1395 int size,
1396 const char *errors,
1397 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001399 const char *starts = s;
1400 int startinpos;
1401 int endinpos;
1402 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403 PyUnicodeObject *unicode;
1404 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001405 const unsigned char *q, *e;
1406 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001407 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001408 /* Offsets from q for retrieving byte pairs in the right order. */
1409#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1410 int ihi = 1, ilo = 0;
1411#else
1412 int ihi = 0, ilo = 1;
1413#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 PyObject *errorHandler = NULL;
1415 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416
1417 /* Note: size will always be longer than the resulting Unicode
1418 character count */
1419 unicode = _PyUnicode_New(size);
1420 if (!unicode)
1421 return NULL;
1422 if (size == 0)
1423 return (PyObject *)unicode;
1424
1425 /* Unpack UTF-16 encoded data */
1426 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001427 q = (unsigned char *)s;
1428 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429
1430 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001431 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001433 /* Check for BOM marks (U+FEFF) in the input and adjust current
1434 byte order setting accordingly. In native mode, the leading BOM
1435 mark is skipped, in all other modes, it is copied to the output
1436 stream as-is (giving a ZWNBSP character). */
1437 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001438 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001439#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001440 if (bom == 0xFEFF) {
1441 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001442 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001443 }
1444 else if (bom == 0xFFFE) {
1445 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001446 bo = 1;
1447 }
Tim Petersced69f82003-09-16 20:30:58 +00001448#else
Tim Peters772747b2001-08-09 22:21:55 +00001449 if (bom == 0xFEFF) {
1450 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001451 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001452 }
1453 else if (bom == 0xFFFE) {
1454 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001455 bo = -1;
1456 }
1457#endif
1458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459
Tim Peters772747b2001-08-09 22:21:55 +00001460 if (bo == -1) {
1461 /* force LE */
1462 ihi = 1;
1463 ilo = 0;
1464 }
1465 else if (bo == 1) {
1466 /* force BE */
1467 ihi = 0;
1468 ilo = 1;
1469 }
1470
1471 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 Py_UNICODE ch;
1473 /* remaing bytes at the end? (size should be even) */
1474 if (e-q<2) {
1475 errmsg = "truncated data";
1476 startinpos = ((const char *)q)-starts;
1477 endinpos = ((const char *)e)-starts;
1478 goto utf16Error;
1479 /* The remaining input chars are ignored if the callback
1480 chooses to skip the input */
1481 }
1482 ch = (q[ihi] << 8) | q[ilo];
1483
Tim Peters772747b2001-08-09 22:21:55 +00001484 q += 2;
1485
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 if (ch < 0xD800 || ch > 0xDFFF) {
1487 *p++ = ch;
1488 continue;
1489 }
1490
1491 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001492 if (q >= e) {
1493 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 startinpos = (((const char *)q)-2)-starts;
1495 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001496 goto utf16Error;
1497 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001498 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001499 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1500 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001501 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001502#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001503 *p++ = ch;
1504 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001505#else
1506 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001507#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001508 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001509 }
1510 else {
1511 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001512 startinpos = (((const char *)q)-4)-starts;
1513 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001514 goto utf16Error;
1515 }
1516
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001518 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001519 startinpos = (((const char *)q)-2)-starts;
1520 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001521 /* Fall through to report the error */
1522
1523 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 outpos = p-PyUnicode_AS_UNICODE(unicode);
1525 if (unicode_decode_call_errorhandler(
1526 errors, &errorHandler,
1527 "utf16", errmsg,
1528 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1529 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001530 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001531 }
1532
1533 if (byteorder)
1534 *byteorder = bo;
1535
1536 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001537 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001538 goto onError;
1539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 Py_XDECREF(errorHandler);
1541 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 return (PyObject *)unicode;
1543
1544onError:
1545 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001546 Py_XDECREF(errorHandler);
1547 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548 return NULL;
1549}
1550
Tim Peters772747b2001-08-09 22:21:55 +00001551PyObject *
1552PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1553 int size,
1554 const char *errors,
1555 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556{
1557 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001558 unsigned char *p;
1559 int i, pairs;
1560 /* Offsets from p for storing byte pairs in the right order. */
1561#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1562 int ihi = 1, ilo = 0;
1563#else
1564 int ihi = 0, ilo = 1;
1565#endif
1566
1567#define STORECHAR(CH) \
1568 do { \
1569 p[ihi] = ((CH) >> 8) & 0xff; \
1570 p[ilo] = (CH) & 0xff; \
1571 p += 2; \
1572 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001574 for (i = pairs = 0; i < size; i++)
1575 if (s[i] >= 0x10000)
1576 pairs++;
Tim Petersced69f82003-09-16 20:30:58 +00001577 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001578 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579 if (v == NULL)
1580 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581
Tim Peters772747b2001-08-09 22:21:55 +00001582 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001584 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001585 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001586 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001587
1588 if (byteorder == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (byteorder == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001599 while (size-- > 0) {
1600 Py_UNICODE ch = *s++;
1601 Py_UNICODE ch2 = 0;
1602 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001603 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1604 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605 }
Tim Peters772747b2001-08-09 22:21:55 +00001606 STORECHAR(ch);
1607 if (ch2)
1608 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001611#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612}
1613
1614PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1615{
1616 if (!PyUnicode_Check(unicode)) {
1617 PyErr_BadArgument();
1618 return NULL;
1619 }
1620 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1621 PyUnicode_GET_SIZE(unicode),
1622 NULL,
1623 0);
1624}
1625
/* --- Unicode Escape Codec ----------------------------------------------- */
1627
Fredrik Lundh06d12682001-01-24 07:59:11 +00001628static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001629
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1631 int size,
1632 const char *errors)
1633{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001634 const char *starts = s;
1635 int startinpos;
1636 int endinpos;
1637 int outpos;
1638 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001639 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001640 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001642 char* message;
1643 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001644 PyObject *errorHandler = NULL;
1645 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 /* Escaped strings will always be longer than the resulting
1648 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 length after conversion to the true value.
1650 (but if the error callback returns a long replacement string
1651 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 v = _PyUnicode_New(size);
1653 if (v == NULL)
1654 goto onError;
1655 if (size == 0)
1656 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001658 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001660
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 while (s < end) {
1662 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001663 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665
1666 /* Non-escape characters are interpreted as Unicode ordinals */
1667 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001668 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 continue;
1670 }
1671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001672 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 /* \ - Escapes */
1674 s++;
1675 switch (*s++) {
1676
1677 /* \x escapes */
1678 case '\n': break;
1679 case '\\': *p++ = '\\'; break;
1680 case '\'': *p++ = '\''; break;
1681 case '\"': *p++ = '\"'; break;
1682 case 'b': *p++ = '\b'; break;
1683 case 'f': *p++ = '\014'; break; /* FF */
1684 case 't': *p++ = '\t'; break;
1685 case 'n': *p++ = '\n'; break;
1686 case 'r': *p++ = '\r'; break;
1687 case 'v': *p++ = '\013'; break; /* VT */
1688 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1689
1690 /* \OOO (octal) escapes */
1691 case '0': case '1': case '2': case '3':
1692 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001693 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001695 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001697 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001699 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 break;
1701
Fredrik Lundhccc74732001-02-18 22:13:49 +00001702 /* hex escapes */
1703 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001705 digits = 2;
1706 message = "truncated \\xXX escape";
1707 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708
Fredrik Lundhccc74732001-02-18 22:13:49 +00001709 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001711 digits = 4;
1712 message = "truncated \\uXXXX escape";
1713 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Fredrik Lundhccc74732001-02-18 22:13:49 +00001715 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001716 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001717 digits = 8;
1718 message = "truncated \\UXXXXXXXX escape";
1719 hexescape:
1720 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001721 outpos = p-PyUnicode_AS_UNICODE(v);
1722 if (s+digits>end) {
1723 endinpos = size;
1724 if (unicode_decode_call_errorhandler(
1725 errors, &errorHandler,
1726 "unicodeescape", "end of string in escape sequence",
1727 starts, size, &startinpos, &endinpos, &exc, &s,
1728 (PyObject **)&v, &outpos, &p))
1729 goto onError;
1730 goto nextByte;
1731 }
1732 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001733 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001734 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001735 endinpos = (s+i+1)-starts;
1736 if (unicode_decode_call_errorhandler(
1737 errors, &errorHandler,
1738 "unicodeescape", message,
1739 starts, size, &startinpos, &endinpos, &exc, &s,
1740 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001741 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001743 }
1744 chr = (chr<<4) & ~0xF;
1745 if (c >= '0' && c <= '9')
1746 chr += c - '0';
1747 else if (c >= 'a' && c <= 'f')
1748 chr += 10 + c - 'a';
1749 else
1750 chr += 10 + c - 'A';
1751 }
1752 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001753 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001754 /* _decoding_error will have already written into the
1755 target buffer. */
1756 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001757 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001758 /* when we get here, chr is a 32-bit unicode character */
1759 if (chr <= 0xffff)
1760 /* UCS-2 character */
1761 *p++ = (Py_UNICODE) chr;
1762 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001763 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001764 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001765#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001766 *p++ = chr;
1767#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001768 chr -= 0x10000L;
1769 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001770 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001771#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001772 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773 endinpos = s-starts;
1774 outpos = p-PyUnicode_AS_UNICODE(v);
1775 if (unicode_decode_call_errorhandler(
1776 errors, &errorHandler,
1777 "unicodeescape", "illegal Unicode character",
1778 starts, size, &startinpos, &endinpos, &exc, &s,
1779 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001780 goto onError;
1781 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001782 break;
1783
1784 /* \N{name} */
1785 case 'N':
1786 message = "malformed \\N character escape";
1787 if (ucnhash_CAPI == NULL) {
1788 /* load the unicode data module */
1789 PyObject *m, *v;
1790 m = PyImport_ImportModule("unicodedata");
1791 if (m == NULL)
1792 goto ucnhashError;
1793 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1794 Py_DECREF(m);
1795 if (v == NULL)
1796 goto ucnhashError;
1797 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1798 Py_DECREF(v);
1799 if (ucnhash_CAPI == NULL)
1800 goto ucnhashError;
1801 }
1802 if (*s == '{') {
1803 const char *start = s+1;
1804 /* look for the closing brace */
1805 while (*s != '}' && s < end)
1806 s++;
1807 if (s > start && s < end && *s == '}') {
1808 /* found a name. look it up in the unicode database */
1809 message = "unknown Unicode character name";
1810 s++;
1811 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1812 goto store;
1813 }
1814 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 endinpos = s-starts;
1816 outpos = p-PyUnicode_AS_UNICODE(v);
1817 if (unicode_decode_call_errorhandler(
1818 errors, &errorHandler,
1819 "unicodeescape", message,
1820 starts, size, &startinpos, &endinpos, &exc, &s,
1821 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001823 break;
1824
1825 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001826 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 message = "\\ at end of string";
1828 s--;
1829 endinpos = s-starts;
1830 outpos = p-PyUnicode_AS_UNICODE(v);
1831 if (unicode_decode_call_errorhandler(
1832 errors, &errorHandler,
1833 "unicodeescape", message,
1834 starts, size, &startinpos, &endinpos, &exc, &s,
1835 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001836 goto onError;
1837 }
1838 else {
1839 *p++ = '\\';
1840 *p++ = (unsigned char)s[-1];
1841 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001842 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001843 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 nextByte:
1845 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001847 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001848 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001849 Py_XDECREF(errorHandler);
1850 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001852
Fredrik Lundhccc74732001-02-18 22:13:49 +00001853ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001854 PyErr_SetString(
1855 PyExc_UnicodeError,
1856 "\\N escapes not supported (can't load unicodedata module)"
1857 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 Py_XDECREF(errorHandler);
1859 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001860 return NULL;
1861
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001864 Py_XDECREF(errorHandler);
1865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 return NULL;
1867}
1868
1869/* Return a Unicode-Escape string version of the Unicode object.
1870
1871 If quotes is true, the string is enclosed in u"" or u'' quotes as
1872 appropriate.
1873
1874*/
1875
Barry Warsaw51ac5802000-03-20 16:36:48 +00001876static const Py_UNICODE *findchar(const Py_UNICODE *s,
1877 int size,
1878 Py_UNICODE ch);
1879
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880static
1881PyObject *unicodeescape_string(const Py_UNICODE *s,
1882 int size,
1883 int quotes)
1884{
1885 PyObject *repr;
1886 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001888 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889
1890 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1891 if (repr == NULL)
1892 return NULL;
1893
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001894 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895
1896 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001898 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 !findchar(s, size, '"')) ? '"' : '\'';
1900 }
1901 while (size-- > 0) {
1902 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001903
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001905 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001906 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907 *p++ = '\\';
1908 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001909 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001910 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001911
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001912#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001913 /* Map 21-bit characters to '\U00xxxxxx' */
1914 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001915 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001916
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001917 /* Resize the string if necessary */
1918 if (offset + 12 > PyString_GET_SIZE(repr)) {
1919 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001920 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001921 p = PyString_AS_STRING(repr) + offset;
1922 }
1923
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001924 *p++ = '\\';
1925 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001926 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1927 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1928 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1929 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1930 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1931 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1932 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001933 *p++ = hexdigit[ch & 0x0000000F];
1934 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001935 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001936#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001937 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1938 else if (ch >= 0xD800 && ch < 0xDC00) {
1939 Py_UNICODE ch2;
1940 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00001941
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001942 ch2 = *s++;
1943 size--;
1944 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1945 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1946 *p++ = '\\';
1947 *p++ = 'U';
1948 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1949 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1950 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1951 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1952 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1953 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1954 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1955 *p++ = hexdigit[ucs & 0x0000000F];
1956 continue;
1957 }
1958 /* Fall through: isolated surrogates are copied as-is */
1959 s--;
1960 size++;
1961 }
1962
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001964 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 *p++ = '\\';
1966 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001967 *p++ = hexdigit[(ch >> 12) & 0x000F];
1968 *p++ = hexdigit[(ch >> 8) & 0x000F];
1969 *p++ = hexdigit[(ch >> 4) & 0x000F];
1970 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001972
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001973 /* Map special whitespace to '\t', \n', '\r' */
1974 else if (ch == '\t') {
1975 *p++ = '\\';
1976 *p++ = 't';
1977 }
1978 else if (ch == '\n') {
1979 *p++ = '\\';
1980 *p++ = 'n';
1981 }
1982 else if (ch == '\r') {
1983 *p++ = '\\';
1984 *p++ = 'r';
1985 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001986
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001987 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001988 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001990 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001991 *p++ = hexdigit[(ch >> 4) & 0x000F];
1992 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00001993 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001994
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995 /* Copy everything else as-is */
1996 else
1997 *p++ = (char) ch;
1998 }
1999 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002000 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
2002 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002003 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002004 return repr;
2005}
2006
2007PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2008 int size)
2009{
2010 return unicodeescape_string(s, size, 0);
2011}
2012
2013PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2014{
2015 if (!PyUnicode_Check(unicode)) {
2016 PyErr_BadArgument();
2017 return NULL;
2018 }
2019 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2020 PyUnicode_GET_SIZE(unicode));
2021}
2022
/* --- Raw Unicode Escape Codec ------------------------------------------- */
2024
2025PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2026 int size,
2027 const char *errors)
2028{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002029 const char *starts = s;
2030 int startinpos;
2031 int endinpos;
2032 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002034 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 const char *end;
2036 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002037 PyObject *errorHandler = NULL;
2038 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002039
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 /* Escaped strings will always be longer than the resulting
2041 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002042 length after conversion to the true value. (But decoding error
2043 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 v = _PyUnicode_New(size);
2045 if (v == NULL)
2046 goto onError;
2047 if (size == 0)
2048 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 end = s + size;
2051 while (s < end) {
2052 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002053 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002055 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056
2057 /* Non-escape characters are interpreted as Unicode ordinals */
2058 if (*s != '\\') {
2059 *p++ = (unsigned char)*s++;
2060 continue;
2061 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063
2064 /* \u-escapes are only interpreted iff the number of leading
2065 backslashes if odd */
2066 bs = s;
2067 for (;s < end;) {
2068 if (*s != '\\')
2069 break;
2070 *p++ = (unsigned char)*s++;
2071 }
2072 if (((s - bs) & 1) == 0 ||
2073 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002074 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 continue;
2076 }
2077 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002078 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 s++;
2080
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002081 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002083 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002084 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002086 endinpos = s-starts;
2087 if (unicode_decode_call_errorhandler(
2088 errors, &errorHandler,
2089 "rawunicodeescape", "truncated \\uXXXX",
2090 starts, size, &startinpos, &endinpos, &exc, &s,
2091 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002093 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 }
2095 x = (x<<4) & ~0xF;
2096 if (c >= '0' && c <= '9')
2097 x += c - '0';
2098 else if (c >= 'a' && c <= 'f')
2099 x += 10 + c - 'a';
2100 else
2101 x += 10 + c - 'A';
2102 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002103#ifndef Py_UNICODE_WIDE
2104 if (x > 0x10000) {
2105 if (unicode_decode_call_errorhandler(
2106 errors, &errorHandler,
2107 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2108 starts, size, &startinpos, &endinpos, &exc, &s,
2109 (PyObject **)&v, &outpos, &p))
2110 goto onError;
2111 }
2112#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113 *p++ = x;
2114 nextByte:
2115 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002117 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002118 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 Py_XDECREF(errorHandler);
2120 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002122
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 onError:
2124 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 Py_XDECREF(errorHandler);
2126 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 return NULL;
2128}
2129
2130PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2131 int size)
2132{
2133 PyObject *repr;
2134 char *p;
2135 char *q;
2136
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002137 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002139#ifdef Py_UNICODE_WIDE
2140 repr = PyString_FromStringAndSize(NULL, 10 * size);
2141#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 if (repr == NULL)
2145 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002146 if (size == 0)
2147 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148
2149 p = q = PyString_AS_STRING(repr);
2150 while (size-- > 0) {
2151 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002152#ifdef Py_UNICODE_WIDE
2153 /* Map 32-bit characters to '\Uxxxxxxxx' */
2154 if (ch >= 0x10000) {
2155 *p++ = '\\';
2156 *p++ = 'U';
2157 *p++ = hexdigit[(ch >> 28) & 0xf];
2158 *p++ = hexdigit[(ch >> 24) & 0xf];
2159 *p++ = hexdigit[(ch >> 20) & 0xf];
2160 *p++ = hexdigit[(ch >> 16) & 0xf];
2161 *p++ = hexdigit[(ch >> 12) & 0xf];
2162 *p++ = hexdigit[(ch >> 8) & 0xf];
2163 *p++ = hexdigit[(ch >> 4) & 0xf];
2164 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002165 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002166 else
2167#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 /* Map 16-bit characters to '\uxxxx' */
2169 if (ch >= 256) {
2170 *p++ = '\\';
2171 *p++ = 'u';
2172 *p++ = hexdigit[(ch >> 12) & 0xf];
2173 *p++ = hexdigit[(ch >> 8) & 0xf];
2174 *p++ = hexdigit[(ch >> 4) & 0xf];
2175 *p++ = hexdigit[ch & 15];
2176 }
2177 /* Copy everything else as-is */
2178 else
2179 *p++ = (char) ch;
2180 }
2181 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002182 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 return repr;
2184}
2185
2186PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2187{
2188 if (!PyUnicode_Check(unicode)) {
2189 PyErr_BadArgument();
2190 return NULL;
2191 }
2192 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2193 PyUnicode_GET_SIZE(unicode));
2194}
2195
/* --- Latin-1 Codec ------------------------------------------------------ */
2197
2198PyObject *PyUnicode_DecodeLatin1(const char *s,
2199 int size,
2200 const char *errors)
2201{
2202 PyUnicodeObject *v;
2203 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002204
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002206 if (size == 1 && *(unsigned char*)s < 256) {
2207 Py_UNICODE r = *(unsigned char*)s;
2208 return PyUnicode_FromUnicode(&r, 1);
2209 }
2210
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 v = _PyUnicode_New(size);
2212 if (v == NULL)
2213 goto onError;
2214 if (size == 0)
2215 return (PyObject *)v;
2216 p = PyUnicode_AS_UNICODE(v);
2217 while (size-- > 0)
2218 *p++ = (unsigned char)*s++;
2219 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002220
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 onError:
2222 Py_XDECREF(v);
2223 return NULL;
2224}
2225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002226/* create or adjust a UnicodeEncodeError */
2227static void make_encode_exception(PyObject **exceptionObject,
2228 const char *encoding,
2229 const Py_UNICODE *unicode, int size,
2230 int startpos, int endpos,
2231 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002233 if (*exceptionObject == NULL) {
2234 *exceptionObject = PyUnicodeEncodeError_Create(
2235 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 }
2237 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002238 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2239 goto onError;
2240 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2241 goto onError;
2242 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2243 goto onError;
2244 return;
2245 onError:
2246 Py_DECREF(*exceptionObject);
2247 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 }
2249}
2250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002251/* raises a UnicodeEncodeError */
2252static void raise_encode_exception(PyObject **exceptionObject,
2253 const char *encoding,
2254 const Py_UNICODE *unicode, int size,
2255 int startpos, int endpos,
2256 const char *reason)
2257{
2258 make_encode_exception(exceptionObject,
2259 encoding, unicode, size, startpos, endpos, reason);
2260 if (*exceptionObject != NULL)
2261 PyCodec_StrictErrors(*exceptionObject);
2262}
2263
2264/* error handling callback helper:
2265 build arguments, call the callback and check the arguments,
2266 put the result into newpos and return the replacement string, which
2267 has to be freed by the caller */
2268static PyObject *unicode_encode_call_errorhandler(const char *errors,
2269 PyObject **errorHandler,
2270 const char *encoding, const char *reason,
2271 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2272 int startpos, int endpos,
2273 int *newpos)
2274{
2275 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2276
2277 PyObject *restuple;
2278 PyObject *resunicode;
2279
2280 if (*errorHandler == NULL) {
2281 *errorHandler = PyCodec_LookupError(errors);
2282 if (*errorHandler == NULL)
2283 return NULL;
2284 }
2285
2286 make_encode_exception(exceptionObject,
2287 encoding, unicode, size, startpos, endpos, reason);
2288 if (*exceptionObject == NULL)
2289 return NULL;
2290
2291 restuple = PyObject_CallFunctionObjArgs(
2292 *errorHandler, *exceptionObject, NULL);
2293 if (restuple == NULL)
2294 return NULL;
2295 if (!PyTuple_Check(restuple)) {
2296 PyErr_Format(PyExc_TypeError, &argparse[4]);
2297 Py_DECREF(restuple);
2298 return NULL;
2299 }
2300 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2301 &resunicode, newpos)) {
2302 Py_DECREF(restuple);
2303 return NULL;
2304 }
2305 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002306 *newpos = size+*newpos;
2307 if (*newpos<0 || *newpos>size) {
2308 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2309 Py_DECREF(restuple);
2310 return NULL;
2311 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312 Py_INCREF(resunicode);
2313 Py_DECREF(restuple);
2314 return resunicode;
2315}
2316
2317static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2318 int size,
2319 const char *errors,
2320 int limit)
2321{
2322 /* output object */
2323 PyObject *res;
2324 /* pointers to the beginning and end+1 of input */
2325 const Py_UNICODE *startp = p;
2326 const Py_UNICODE *endp = p + size;
2327 /* pointer to the beginning of the unencodable characters */
2328 /* const Py_UNICODE *badp = NULL; */
2329 /* pointer into the output */
2330 char *str;
2331 /* current output position */
2332 int respos = 0;
2333 int ressize;
2334 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2335 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2336 PyObject *errorHandler = NULL;
2337 PyObject *exc = NULL;
2338 /* the following variable is used for caching string comparisons
2339 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2340 int known_errorHandler = -1;
2341
2342 /* allocate enough for a simple encoding without
2343 replacements, if we need more, we'll resize */
2344 res = PyString_FromStringAndSize(NULL, size);
2345 if (res == NULL)
2346 goto onError;
2347 if (size == 0)
2348 return res;
2349 str = PyString_AS_STRING(res);
2350 ressize = size;
2351
2352 while (p<endp) {
2353 Py_UNICODE c = *p;
2354
2355 /* can we encode this? */
2356 if (c<limit) {
2357 /* no overflow check, because we know that the space is enough */
2358 *str++ = (char)c;
2359 ++p;
2360 }
2361 else {
2362 int unicodepos = p-startp;
2363 int requiredsize;
2364 PyObject *repunicode;
2365 int repsize;
2366 int newpos;
2367 int respos;
2368 Py_UNICODE *uni2;
2369 /* startpos for collecting unencodable chars */
2370 const Py_UNICODE *collstart = p;
2371 const Py_UNICODE *collend = p;
2372 /* find all unecodable characters */
2373 while ((collend < endp) && ((*collend)>=limit))
2374 ++collend;
2375 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2376 if (known_errorHandler==-1) {
2377 if ((errors==NULL) || (!strcmp(errors, "strict")))
2378 known_errorHandler = 1;
2379 else if (!strcmp(errors, "replace"))
2380 known_errorHandler = 2;
2381 else if (!strcmp(errors, "ignore"))
2382 known_errorHandler = 3;
2383 else if (!strcmp(errors, "xmlcharrefreplace"))
2384 known_errorHandler = 4;
2385 else
2386 known_errorHandler = 0;
2387 }
2388 switch (known_errorHandler) {
2389 case 1: /* strict */
2390 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2391 goto onError;
2392 case 2: /* replace */
2393 while (collstart++<collend)
2394 *str++ = '?'; /* fall through */
2395 case 3: /* ignore */
2396 p = collend;
2397 break;
2398 case 4: /* xmlcharrefreplace */
2399 respos = str-PyString_AS_STRING(res);
2400 /* determine replacement size (temporarily (mis)uses p) */
2401 for (p = collstart, repsize = 0; p < collend; ++p) {
2402 if (*p<10)
2403 repsize += 2+1+1;
2404 else if (*p<100)
2405 repsize += 2+2+1;
2406 else if (*p<1000)
2407 repsize += 2+3+1;
2408 else if (*p<10000)
2409 repsize += 2+4+1;
2410 else if (*p<100000)
2411 repsize += 2+5+1;
2412 else if (*p<1000000)
2413 repsize += 2+6+1;
2414 else
2415 repsize += 2+7+1;
2416 }
2417 requiredsize = respos+repsize+(endp-collend);
2418 if (requiredsize > ressize) {
2419 if (requiredsize<2*ressize)
2420 requiredsize = 2*ressize;
2421 if (_PyString_Resize(&res, requiredsize))
2422 goto onError;
2423 str = PyString_AS_STRING(res) + respos;
2424 ressize = requiredsize;
2425 }
2426 /* generate replacement (temporarily (mis)uses p) */
2427 for (p = collstart; p < collend; ++p) {
2428 str += sprintf(str, "&#%d;", (int)*p);
2429 }
2430 p = collend;
2431 break;
2432 default:
2433 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2434 encoding, reason, startp, size, &exc,
2435 collstart-startp, collend-startp, &newpos);
2436 if (repunicode == NULL)
2437 goto onError;
2438 /* need more space? (at least enough for what we
2439 have+the replacement+the rest of the string, so
2440 we won't have to check space for encodable characters) */
2441 respos = str-PyString_AS_STRING(res);
2442 repsize = PyUnicode_GET_SIZE(repunicode);
2443 requiredsize = respos+repsize+(endp-collend);
2444 if (requiredsize > ressize) {
2445 if (requiredsize<2*ressize)
2446 requiredsize = 2*ressize;
2447 if (_PyString_Resize(&res, requiredsize)) {
2448 Py_DECREF(repunicode);
2449 goto onError;
2450 }
2451 str = PyString_AS_STRING(res) + respos;
2452 ressize = requiredsize;
2453 }
2454 /* check if there is anything unencodable in the replacement
2455 and copy it to the output */
2456 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2457 c = *uni2;
2458 if (c >= limit) {
2459 raise_encode_exception(&exc, encoding, startp, size,
2460 unicodepos, unicodepos+1, reason);
2461 Py_DECREF(repunicode);
2462 goto onError;
2463 }
2464 *str = (char)c;
2465 }
2466 p = startp + newpos;
2467 Py_DECREF(repunicode);
2468 }
2469 }
2470 }
2471 /* Resize if we allocated to much */
2472 respos = str-PyString_AS_STRING(res);
2473 if (respos<ressize)
2474 /* If this falls res will be NULL */
2475 _PyString_Resize(&res, respos);
2476 Py_XDECREF(errorHandler);
2477 Py_XDECREF(exc);
2478 return res;
2479
2480 onError:
2481 Py_XDECREF(res);
2482 Py_XDECREF(errorHandler);
2483 Py_XDECREF(exc);
2484 return NULL;
2485}
2486
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2488 int size,
2489 const char *errors)
2490{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492}
2493
2494PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2495{
2496 if (!PyUnicode_Check(unicode)) {
2497 PyErr_BadArgument();
2498 return NULL;
2499 }
2500 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2501 PyUnicode_GET_SIZE(unicode),
2502 NULL);
2503}
2504
2505/* --- 7-bit ASCII Codec -------------------------------------------------- */
2506
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507PyObject *PyUnicode_DecodeASCII(const char *s,
2508 int size,
2509 const char *errors)
2510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 PyUnicodeObject *v;
2513 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002514 int startinpos;
2515 int endinpos;
2516 int outpos;
2517 const char *e;
2518 PyObject *errorHandler = NULL;
2519 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002520
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002522 if (size == 1 && *(unsigned char*)s < 128) {
2523 Py_UNICODE r = *(unsigned char*)s;
2524 return PyUnicode_FromUnicode(&r, 1);
2525 }
Tim Petersced69f82003-09-16 20:30:58 +00002526
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 v = _PyUnicode_New(size);
2528 if (v == NULL)
2529 goto onError;
2530 if (size == 0)
2531 return (PyObject *)v;
2532 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002533 e = s + size;
2534 while (s < e) {
2535 register unsigned char c = (unsigned char)*s;
2536 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 ++s;
2539 }
2540 else {
2541 startinpos = s-starts;
2542 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002543 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 if (unicode_decode_call_errorhandler(
2545 errors, &errorHandler,
2546 "ascii", "ordinal not in range(128)",
2547 starts, size, &startinpos, &endinpos, &exc, &s,
2548 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002552 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002553 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002554 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002555 Py_XDECREF(errorHandler);
2556 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002558
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 onError:
2560 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 Py_XDECREF(errorHandler);
2562 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 return NULL;
2564}
2565
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2567 int size,
2568 const char *errors)
2569{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002570 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571}
2572
2573PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2574{
2575 if (!PyUnicode_Check(unicode)) {
2576 PyErr_BadArgument();
2577 return NULL;
2578 }
2579 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2580 PyUnicode_GET_SIZE(unicode),
2581 NULL);
2582}
2583
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002584#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002585
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002586/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002587
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002588PyObject *PyUnicode_DecodeMBCS(const char *s,
2589 int size,
2590 const char *errors)
2591{
2592 PyUnicodeObject *v;
2593 Py_UNICODE *p;
2594
2595 /* First get the size of the result */
2596 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002597 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002598 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2599
2600 v = _PyUnicode_New(usize);
2601 if (v == NULL)
2602 return NULL;
2603 if (usize == 0)
2604 return (PyObject *)v;
2605 p = PyUnicode_AS_UNICODE(v);
2606 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2607 Py_DECREF(v);
2608 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2609 }
2610
2611 return (PyObject *)v;
2612}
2613
2614PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2615 int size,
2616 const char *errors)
2617{
2618 PyObject *repr;
2619 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002620 DWORD mbcssize;
2621
2622 /* If there are no characters, bail now! */
2623 if (size==0)
2624 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002625
2626 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002627 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002628 if (mbcssize==0)
2629 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2630
2631 repr = PyString_FromStringAndSize(NULL, mbcssize);
2632 if (repr == NULL)
2633 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002634 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002635 return repr;
2636
2637 /* Do the conversion */
2638 s = PyString_AS_STRING(repr);
2639 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2640 Py_DECREF(repr);
2641 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2642 }
2643 return repr;
2644}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002645
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002646PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2647{
2648 if (!PyUnicode_Check(unicode)) {
2649 PyErr_BadArgument();
2650 return NULL;
2651 }
2652 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2653 PyUnicode_GET_SIZE(unicode),
2654 NULL);
2655}
2656
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002657#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002658
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659/* --- Character Mapping Codec -------------------------------------------- */
2660
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661PyObject *PyUnicode_DecodeCharmap(const char *s,
2662 int size,
2663 PyObject *mapping,
2664 const char *errors)
2665{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002666 const char *starts = s;
2667 int startinpos;
2668 int endinpos;
2669 int outpos;
2670 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 PyUnicodeObject *v;
2672 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002673 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 PyObject *errorHandler = NULL;
2675 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002676
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 /* Default to Latin-1 */
2678 if (mapping == NULL)
2679 return PyUnicode_DecodeLatin1(s, size, errors);
2680
2681 v = _PyUnicode_New(size);
2682 if (v == NULL)
2683 goto onError;
2684 if (size == 0)
2685 return (PyObject *)v;
2686 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002687 e = s + size;
2688 while (s < e) {
2689 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 PyObject *w, *x;
2691
2692 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2693 w = PyInt_FromLong((long)ch);
2694 if (w == NULL)
2695 goto onError;
2696 x = PyObject_GetItem(mapping, w);
2697 Py_DECREF(w);
2698 if (x == NULL) {
2699 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002700 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002702 x = Py_None;
2703 Py_INCREF(x);
2704 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002705 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 }
2707
2708 /* Apply mapping */
2709 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002710 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711 if (value < 0 || value > 65535) {
2712 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002713 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 Py_DECREF(x);
2715 goto onError;
2716 }
2717 *p++ = (Py_UNICODE)value;
2718 }
2719 else if (x == Py_None) {
2720 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 outpos = p-PyUnicode_AS_UNICODE(v);
2722 startinpos = s-starts;
2723 endinpos = startinpos+1;
2724 if (unicode_decode_call_errorhandler(
2725 errors, &errorHandler,
2726 "charmap", "character maps to <undefined>",
2727 starts, size, &startinpos, &endinpos, &exc, &s,
2728 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 Py_DECREF(x);
2730 goto onError;
2731 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 }
2734 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002735 int targetsize = PyUnicode_GET_SIZE(x);
2736
2737 if (targetsize == 1)
2738 /* 1-1 mapping */
2739 *p++ = *PyUnicode_AS_UNICODE(x);
2740
2741 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002743 if (targetsize > extrachars) {
2744 /* resize first */
2745 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2746 int needed = (targetsize - extrachars) + \
2747 (targetsize << 2);
2748 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002749 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002750 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002751 Py_DECREF(x);
2752 goto onError;
2753 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002754 p = PyUnicode_AS_UNICODE(v) + oldpos;
2755 }
2756 Py_UNICODE_COPY(p,
2757 PyUnicode_AS_UNICODE(x),
2758 targetsize);
2759 p += targetsize;
2760 extrachars -= targetsize;
2761 }
2762 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 }
2764 else {
2765 /* wrong return value */
2766 PyErr_SetString(PyExc_TypeError,
2767 "character mapping must return integer, None or unicode");
2768 Py_DECREF(x);
2769 goto onError;
2770 }
2771 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 }
2774 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002775 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002777 Py_XDECREF(errorHandler);
2778 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002780
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 Py_XDECREF(errorHandler);
2783 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 Py_XDECREF(v);
2785 return NULL;
2786}
2787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788/* Lookup the character ch in the mapping. If the character
2789 can't be found, Py_None is returned (or NULL, if another
2790 error occured). */
2791static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 PyObject *w = PyInt_FromLong((long)c);
2794 PyObject *x;
2795
2796 if (w == NULL)
2797 return NULL;
2798 x = PyObject_GetItem(mapping, w);
2799 Py_DECREF(w);
2800 if (x == NULL) {
2801 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2802 /* No mapping found means: mapping is undefined. */
2803 PyErr_Clear();
2804 x = Py_None;
2805 Py_INCREF(x);
2806 return x;
2807 } else
2808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002810 else if (x == Py_None)
2811 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002812 else if (PyInt_Check(x)) {
2813 long value = PyInt_AS_LONG(x);
2814 if (value < 0 || value > 255) {
2815 PyErr_SetString(PyExc_TypeError,
2816 "character mapping must be in range(256)");
2817 Py_DECREF(x);
2818 return NULL;
2819 }
2820 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 else if (PyString_Check(x))
2823 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002825 /* wrong return value */
2826 PyErr_SetString(PyExc_TypeError,
2827 "character mapping must return integer, None or str");
2828 Py_DECREF(x);
2829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 }
2831}
2832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833/* lookup the character, put the result in the output string and adjust
2834 various state variables. Reallocate the output string if not enough
2835 space is available. Return a new reference to the object that
2836 was put in the output buffer, or Py_None, if the mapping was undefined
2837 (in which case no character was written) or NULL, if a
2838 reallocation error ocurred. The called must decref the result */
2839static
2840PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2841 PyObject **outobj, int *outpos)
2842{
2843 PyObject *rep = charmapencode_lookup(c, mapping);
2844
2845 if (rep==NULL)
2846 return NULL;
2847 else if (rep==Py_None)
2848 return rep;
2849 else {
2850 char *outstart = PyString_AS_STRING(*outobj);
2851 int outsize = PyString_GET_SIZE(*outobj);
2852 if (PyInt_Check(rep)) {
2853 int requiredsize = *outpos+1;
2854 if (outsize<requiredsize) {
2855 /* exponentially overallocate to minimize reallocations */
2856 if (requiredsize < 2*outsize)
2857 requiredsize = 2*outsize;
2858 if (_PyString_Resize(outobj, requiredsize)) {
2859 Py_DECREF(rep);
2860 return NULL;
2861 }
2862 outstart = PyString_AS_STRING(*outobj);
2863 }
2864 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2865 }
2866 else {
2867 const char *repchars = PyString_AS_STRING(rep);
2868 int repsize = PyString_GET_SIZE(rep);
2869 int requiredsize = *outpos+repsize;
2870 if (outsize<requiredsize) {
2871 /* exponentially overallocate to minimize reallocations */
2872 if (requiredsize < 2*outsize)
2873 requiredsize = 2*outsize;
2874 if (_PyString_Resize(outobj, requiredsize)) {
2875 Py_DECREF(rep);
2876 return NULL;
2877 }
2878 outstart = PyString_AS_STRING(*outobj);
2879 }
2880 memcpy(outstart + *outpos, repchars, repsize);
2881 *outpos += repsize;
2882 }
2883 }
2884 return rep;
2885}
2886
2887/* handle an error in PyUnicode_EncodeCharmap
2888 Return 0 on success, -1 on error */
2889static
2890int charmap_encoding_error(
2891 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2892 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002893 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002894 PyObject **res, int *respos)
2895{
2896 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2897 int repsize;
2898 int newpos;
2899 Py_UNICODE *uni2;
2900 /* startpos for collecting unencodable chars */
2901 int collstartpos = *inpos;
2902 int collendpos = *inpos+1;
2903 int collpos;
2904 char *encoding = "charmap";
2905 char *reason = "character maps to <undefined>";
2906
2907 PyObject *x;
2908 /* find all unencodable characters */
2909 while (collendpos < size) {
2910 x = charmapencode_lookup(p[collendpos], mapping);
2911 if (x==NULL)
2912 return -1;
2913 else if (x!=Py_None) {
2914 Py_DECREF(x);
2915 break;
2916 }
2917 Py_DECREF(x);
2918 ++collendpos;
2919 }
2920 /* cache callback name lookup
2921 * (if not done yet, i.e. it's the first error) */
2922 if (*known_errorHandler==-1) {
2923 if ((errors==NULL) || (!strcmp(errors, "strict")))
2924 *known_errorHandler = 1;
2925 else if (!strcmp(errors, "replace"))
2926 *known_errorHandler = 2;
2927 else if (!strcmp(errors, "ignore"))
2928 *known_errorHandler = 3;
2929 else if (!strcmp(errors, "xmlcharrefreplace"))
2930 *known_errorHandler = 4;
2931 else
2932 *known_errorHandler = 0;
2933 }
2934 switch (*known_errorHandler) {
2935 case 1: /* strict */
2936 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2937 return -1;
2938 case 2: /* replace */
2939 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2940 x = charmapencode_output('?', mapping, res, respos);
2941 if (x==NULL) {
2942 return -1;
2943 }
2944 else if (x==Py_None) {
2945 Py_DECREF(x);
2946 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2947 return -1;
2948 }
2949 Py_DECREF(x);
2950 }
2951 /* fall through */
2952 case 3: /* ignore */
2953 *inpos = collendpos;
2954 break;
2955 case 4: /* xmlcharrefreplace */
2956 /* generate replacement (temporarily (mis)uses p) */
2957 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2958 char buffer[2+29+1+1];
2959 char *cp;
2960 sprintf(buffer, "&#%d;", (int)p[collpos]);
2961 for (cp = buffer; *cp; ++cp) {
2962 x = charmapencode_output(*cp, mapping, res, respos);
2963 if (x==NULL)
2964 return -1;
2965 else if (x==Py_None) {
2966 Py_DECREF(x);
2967 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2968 return -1;
2969 }
2970 Py_DECREF(x);
2971 }
2972 }
2973 *inpos = collendpos;
2974 break;
2975 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002976 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002977 encoding, reason, p, size, exceptionObject,
2978 collstartpos, collendpos, &newpos);
2979 if (repunicode == NULL)
2980 return -1;
2981 /* generate replacement */
2982 repsize = PyUnicode_GET_SIZE(repunicode);
2983 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2984 x = charmapencode_output(*uni2, mapping, res, respos);
2985 if (x==NULL) {
2986 Py_DECREF(repunicode);
2987 return -1;
2988 }
2989 else if (x==Py_None) {
2990 Py_DECREF(repunicode);
2991 Py_DECREF(x);
2992 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2993 return -1;
2994 }
2995 Py_DECREF(x);
2996 }
2997 *inpos = newpos;
2998 Py_DECREF(repunicode);
2999 }
3000 return 0;
3001}
3002
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3004 int size,
3005 PyObject *mapping,
3006 const char *errors)
3007{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003008 /* output object */
3009 PyObject *res = NULL;
3010 /* current input position */
3011 int inpos = 0;
3012 /* current output position */
3013 int respos = 0;
3014 PyObject *errorHandler = NULL;
3015 PyObject *exc = NULL;
3016 /* the following variable is used for caching string comparisons
3017 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3018 * 3=ignore, 4=xmlcharrefreplace */
3019 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020
3021 /* Default to Latin-1 */
3022 if (mapping == NULL)
3023 return PyUnicode_EncodeLatin1(p, size, errors);
3024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 /* allocate enough for a simple encoding without
3026 replacements, if we need more, we'll resize */
3027 res = PyString_FromStringAndSize(NULL, size);
3028 if (res == NULL)
3029 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003030 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 while (inpos<size) {
3034 /* try to encode it */
3035 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3036 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 if (x==Py_None) { /* unencodable character */
3039 if (charmap_encoding_error(p, size, &inpos, mapping,
3040 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003041 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003042 &res, &respos)) {
3043 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003044 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003047 else
3048 /* done with this character => adjust input position */
3049 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 Py_DECREF(x);
3051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 /* Resize if we allocated to much */
3054 if (respos<PyString_GET_SIZE(res)) {
3055 if (_PyString_Resize(&res, respos))
3056 goto onError;
3057 }
3058 Py_XDECREF(exc);
3059 Py_XDECREF(errorHandler);
3060 return res;
3061
3062 onError:
3063 Py_XDECREF(res);
3064 Py_XDECREF(exc);
3065 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066 return NULL;
3067}
3068
3069PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3070 PyObject *mapping)
3071{
3072 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3073 PyErr_BadArgument();
3074 return NULL;
3075 }
3076 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3077 PyUnicode_GET_SIZE(unicode),
3078 mapping,
3079 NULL);
3080}
3081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003082/* create or adjust a UnicodeTranslateError */
3083static void make_translate_exception(PyObject **exceptionObject,
3084 const Py_UNICODE *unicode, int size,
3085 int startpos, int endpos,
3086 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 if (*exceptionObject == NULL) {
3089 *exceptionObject = PyUnicodeTranslateError_Create(
3090 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
3092 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003093 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3094 goto onError;
3095 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3096 goto onError;
3097 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3098 goto onError;
3099 return;
3100 onError:
3101 Py_DECREF(*exceptionObject);
3102 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 }
3104}
3105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003106/* raises a UnicodeTranslateError */
3107static void raise_translate_exception(PyObject **exceptionObject,
3108 const Py_UNICODE *unicode, int size,
3109 int startpos, int endpos,
3110 const char *reason)
3111{
3112 make_translate_exception(exceptionObject,
3113 unicode, size, startpos, endpos, reason);
3114 if (*exceptionObject != NULL)
3115 PyCodec_StrictErrors(*exceptionObject);
3116}
3117
3118/* error handling callback helper:
3119 build arguments, call the callback and check the arguments,
3120 put the result into newpos and return the replacement string, which
3121 has to be freed by the caller */
3122static PyObject *unicode_translate_call_errorhandler(const char *errors,
3123 PyObject **errorHandler,
3124 const char *reason,
3125 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3126 int startpos, int endpos,
3127 int *newpos)
3128{
3129 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3130
3131 PyObject *restuple;
3132 PyObject *resunicode;
3133
3134 if (*errorHandler == NULL) {
3135 *errorHandler = PyCodec_LookupError(errors);
3136 if (*errorHandler == NULL)
3137 return NULL;
3138 }
3139
3140 make_translate_exception(exceptionObject,
3141 unicode, size, startpos, endpos, reason);
3142 if (*exceptionObject == NULL)
3143 return NULL;
3144
3145 restuple = PyObject_CallFunctionObjArgs(
3146 *errorHandler, *exceptionObject, NULL);
3147 if (restuple == NULL)
3148 return NULL;
3149 if (!PyTuple_Check(restuple)) {
3150 PyErr_Format(PyExc_TypeError, &argparse[4]);
3151 Py_DECREF(restuple);
3152 return NULL;
3153 }
3154 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3155 &resunicode, newpos)) {
3156 Py_DECREF(restuple);
3157 return NULL;
3158 }
3159 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003160 *newpos = size+*newpos;
3161 if (*newpos<0 || *newpos>size) {
3162 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3163 Py_DECREF(restuple);
3164 return NULL;
3165 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003166 Py_INCREF(resunicode);
3167 Py_DECREF(restuple);
3168 return resunicode;
3169}
3170
3171/* Lookup the character ch in the mapping and put the result in result,
3172 which must be decrefed by the caller.
3173 Return 0 on success, -1 on error */
3174static
3175int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3176{
3177 PyObject *w = PyInt_FromLong((long)c);
3178 PyObject *x;
3179
3180 if (w == NULL)
3181 return -1;
3182 x = PyObject_GetItem(mapping, w);
3183 Py_DECREF(w);
3184 if (x == NULL) {
3185 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3186 /* No mapping found means: use 1:1 mapping. */
3187 PyErr_Clear();
3188 *result = NULL;
3189 return 0;
3190 } else
3191 return -1;
3192 }
3193 else if (x == Py_None) {
3194 *result = x;
3195 return 0;
3196 }
3197 else if (PyInt_Check(x)) {
3198 long value = PyInt_AS_LONG(x);
3199 long max = PyUnicode_GetMax();
3200 if (value < 0 || value > max) {
3201 PyErr_Format(PyExc_TypeError,
3202 "character mapping must be in range(0x%lx)", max+1);
3203 Py_DECREF(x);
3204 return -1;
3205 }
3206 *result = x;
3207 return 0;
3208 }
3209 else if (PyUnicode_Check(x)) {
3210 *result = x;
3211 return 0;
3212 }
3213 else {
3214 /* wrong return value */
3215 PyErr_SetString(PyExc_TypeError,
3216 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003217 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 return -1;
3219 }
3220}
3221/* ensure that *outobj is at least requiredsize characters long,
3222if not reallocate and adjust various state variables.
3223Return 0 on success, -1 on error */
3224static
Walter Dörwald4894c302003-10-24 14:25:28 +00003225int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 int requiredsize)
3227{
Walter Dörwald4894c302003-10-24 14:25:28 +00003228 int oldsize = PyUnicode_GET_SIZE(*outobj);
3229 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003230 /* remember old output position */
3231 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3232 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003233 if (requiredsize < 2 * oldsize)
3234 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003235 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 return -1;
3237 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238 }
3239 return 0;
3240}
3241/* lookup the character, put the result in the output string and adjust
3242 various state variables. Return a new reference to the object that
3243 was put in the output buffer in *result, or Py_None, if the mapping was
3244 undefined (in which case no character was written).
3245 The called must decref result.
3246 Return 0 on success, -1 on error. */
3247static
Walter Dörwald4894c302003-10-24 14:25:28 +00003248int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3249 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3250 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003251{
Walter Dörwald4894c302003-10-24 14:25:28 +00003252 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003253 return -1;
3254 if (*res==NULL) {
3255 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003256 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003257 }
3258 else if (*res==Py_None)
3259 ;
3260 else if (PyInt_Check(*res)) {
3261 /* no overflow check, because we know that the space is enough */
3262 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3263 }
3264 else if (PyUnicode_Check(*res)) {
3265 int repsize = PyUnicode_GET_SIZE(*res);
3266 if (repsize==1) {
3267 /* no overflow check, because we know that the space is enough */
3268 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3269 }
3270 else if (repsize!=0) {
3271 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003272 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
3273 (insize - (*curinp-*startinp)) +
3274 repsize - 1;
3275 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 return -1;
3277 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3278 *outp += repsize;
3279 }
3280 }
3281 else
3282 return -1;
3283 return 0;
3284}
3285
3286PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 int size,
3288 PyObject *mapping,
3289 const char *errors)
3290{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291 /* output object */
3292 PyObject *res = NULL;
3293 /* pointers to the beginning and end+1 of input */
3294 const Py_UNICODE *startp = p;
3295 const Py_UNICODE *endp = p + size;
3296 /* pointer into the output */
3297 Py_UNICODE *str;
3298 /* current output position */
3299 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 char *reason = "character maps to <undefined>";
3301 PyObject *errorHandler = NULL;
3302 PyObject *exc = NULL;
3303 /* the following variable is used for caching string comparisons
3304 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3305 * 3=ignore, 4=xmlcharrefreplace */
3306 int known_errorHandler = -1;
3307
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308 if (mapping == NULL) {
3309 PyErr_BadArgument();
3310 return NULL;
3311 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003312
3313 /* allocate enough for a simple 1:1 translation without
3314 replacements, if we need more, we'll resize */
3315 res = PyUnicode_FromUnicode(NULL, size);
3316 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003317 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 return res;
3320 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322 while (p<endp) {
3323 /* try to encode it */
3324 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003325 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003326 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 goto onError;
3328 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003329 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 if (x!=Py_None) /* it worked => adjust input pointer */
3331 ++p;
3332 else { /* untranslatable character */
3333 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3334 int repsize;
3335 int newpos;
3336 Py_UNICODE *uni2;
3337 /* startpos for collecting untranslatable chars */
3338 const Py_UNICODE *collstart = p;
3339 const Py_UNICODE *collend = p+1;
3340 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 /* find all untranslatable characters */
3343 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003344 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345 goto onError;
3346 Py_XDECREF(x);
3347 if (x!=Py_None)
3348 break;
3349 ++collend;
3350 }
3351 /* cache callback name lookup
3352 * (if not done yet, i.e. it's the first error) */
3353 if (known_errorHandler==-1) {
3354 if ((errors==NULL) || (!strcmp(errors, "strict")))
3355 known_errorHandler = 1;
3356 else if (!strcmp(errors, "replace"))
3357 known_errorHandler = 2;
3358 else if (!strcmp(errors, "ignore"))
3359 known_errorHandler = 3;
3360 else if (!strcmp(errors, "xmlcharrefreplace"))
3361 known_errorHandler = 4;
3362 else
3363 known_errorHandler = 0;
3364 }
3365 switch (known_errorHandler) {
3366 case 1: /* strict */
3367 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3368 goto onError;
3369 case 2: /* replace */
3370 /* No need to check for space, this is a 1:1 replacement */
3371 for (coll = collstart; coll<collend; ++coll)
3372 *str++ = '?';
3373 /* fall through */
3374 case 3: /* ignore */
3375 p = collend;
3376 break;
3377 case 4: /* xmlcharrefreplace */
3378 /* generate replacement (temporarily (mis)uses p) */
3379 for (p = collstart; p < collend; ++p) {
3380 char buffer[2+29+1+1];
3381 char *cp;
3382 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003383 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3385 goto onError;
3386 for (cp = buffer; *cp; ++cp)
3387 *str++ = *cp;
3388 }
3389 p = collend;
3390 break;
3391 default:
3392 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3393 reason, startp, size, &exc,
3394 collstart-startp, collend-startp, &newpos);
3395 if (repunicode == NULL)
3396 goto onError;
3397 /* generate replacement */
3398 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003399 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3401 Py_DECREF(repunicode);
3402 goto onError;
3403 }
3404 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3405 *str++ = *uni2;
3406 p = startp + newpos;
3407 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408 }
3409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 /* Resize if we allocated to much */
3412 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003413 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003414 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003415 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416 }
3417 Py_XDECREF(exc);
3418 Py_XDECREF(errorHandler);
3419 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421 onError:
3422 Py_XDECREF(res);
3423 Py_XDECREF(exc);
3424 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 return NULL;
3426}
3427
3428PyObject *PyUnicode_Translate(PyObject *str,
3429 PyObject *mapping,
3430 const char *errors)
3431{
3432 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003433
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434 str = PyUnicode_FromObject(str);
3435 if (str == NULL)
3436 goto onError;
3437 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3438 PyUnicode_GET_SIZE(str),
3439 mapping,
3440 errors);
3441 Py_DECREF(str);
3442 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003443
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444 onError:
3445 Py_XDECREF(str);
3446 return NULL;
3447}
Tim Petersced69f82003-09-16 20:30:58 +00003448
Guido van Rossum9e896b32000-04-05 20:11:21 +00003449/* --- Decimal Encoder ---------------------------------------------------- */
3450
3451int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3452 int length,
3453 char *output,
3454 const char *errors)
3455{
3456 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 PyObject *errorHandler = NULL;
3458 PyObject *exc = NULL;
3459 const char *encoding = "decimal";
3460 const char *reason = "invalid decimal Unicode string";
3461 /* the following variable is used for caching string comparisons
3462 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3463 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003464
3465 if (output == NULL) {
3466 PyErr_BadArgument();
3467 return -1;
3468 }
3469
3470 p = s;
3471 end = s + length;
3472 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003474 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475 PyObject *repunicode;
3476 int repsize;
3477 int newpos;
3478 Py_UNICODE *uni2;
3479 Py_UNICODE *collstart;
3480 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003481
Guido van Rossum9e896b32000-04-05 20:11:21 +00003482 if (Py_UNICODE_ISSPACE(ch)) {
3483 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003485 continue;
3486 }
3487 decimal = Py_UNICODE_TODECIMAL(ch);
3488 if (decimal >= 0) {
3489 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003490 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003491 continue;
3492 }
Guido van Rossumba477042000-04-06 18:18:10 +00003493 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003494 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003496 continue;
3497 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 /* All other characters are considered unencodable */
3499 collstart = p;
3500 collend = p+1;
3501 while (collend < end) {
3502 if ((0 < *collend && *collend < 256) ||
3503 !Py_UNICODE_ISSPACE(*collend) ||
3504 Py_UNICODE_TODECIMAL(*collend))
3505 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003506 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 /* cache callback name lookup
3508 * (if not done yet, i.e. it's the first error) */
3509 if (known_errorHandler==-1) {
3510 if ((errors==NULL) || (!strcmp(errors, "strict")))
3511 known_errorHandler = 1;
3512 else if (!strcmp(errors, "replace"))
3513 known_errorHandler = 2;
3514 else if (!strcmp(errors, "ignore"))
3515 known_errorHandler = 3;
3516 else if (!strcmp(errors, "xmlcharrefreplace"))
3517 known_errorHandler = 4;
3518 else
3519 known_errorHandler = 0;
3520 }
3521 switch (known_errorHandler) {
3522 case 1: /* strict */
3523 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3524 goto onError;
3525 case 2: /* replace */
3526 for (p = collstart; p < collend; ++p)
3527 *output++ = '?';
3528 /* fall through */
3529 case 3: /* ignore */
3530 p = collend;
3531 break;
3532 case 4: /* xmlcharrefreplace */
3533 /* generate replacement (temporarily (mis)uses p) */
3534 for (p = collstart; p < collend; ++p)
3535 output += sprintf(output, "&#%d;", (int)*p);
3536 p = collend;
3537 break;
3538 default:
3539 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3540 encoding, reason, s, length, &exc,
3541 collstart-s, collend-s, &newpos);
3542 if (repunicode == NULL)
3543 goto onError;
3544 /* generate replacement */
3545 repsize = PyUnicode_GET_SIZE(repunicode);
3546 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3547 Py_UNICODE ch = *uni2;
3548 if (Py_UNICODE_ISSPACE(ch))
3549 *output++ = ' ';
3550 else {
3551 decimal = Py_UNICODE_TODECIMAL(ch);
3552 if (decimal >= 0)
3553 *output++ = '0' + decimal;
3554 else if (0 < ch && ch < 256)
3555 *output++ = (char)ch;
3556 else {
3557 Py_DECREF(repunicode);
3558 raise_encode_exception(&exc, encoding,
3559 s, length, collstart-s, collend-s, reason);
3560 goto onError;
3561 }
3562 }
3563 }
3564 p = s + newpos;
3565 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003566 }
3567 }
3568 /* 0-terminate the output string */
3569 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 Py_XDECREF(exc);
3571 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003572 return 0;
3573
3574 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 Py_XDECREF(exc);
3576 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003577 return -1;
3578}
3579
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580/* --- Helpers ------------------------------------------------------------ */
3581
Tim Petersced69f82003-09-16 20:30:58 +00003582static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583int count(PyUnicodeObject *self,
3584 int start,
3585 int end,
3586 PyUnicodeObject *substring)
3587{
3588 int count = 0;
3589
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003590 if (start < 0)
3591 start += self->length;
3592 if (start < 0)
3593 start = 0;
3594 if (end > self->length)
3595 end = self->length;
3596 if (end < 0)
3597 end += self->length;
3598 if (end < 0)
3599 end = 0;
3600
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003601 if (substring->length == 0)
3602 return (end - start + 1);
3603
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 end -= substring->length;
3605
3606 while (start <= end)
3607 if (Py_UNICODE_MATCH(self, start, substring)) {
3608 count++;
3609 start += substring->length;
3610 } else
3611 start++;
3612
3613 return count;
3614}
3615
3616int PyUnicode_Count(PyObject *str,
3617 PyObject *substr,
3618 int start,
3619 int end)
3620{
3621 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003622
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 str = PyUnicode_FromObject(str);
3624 if (str == NULL)
3625 return -1;
3626 substr = PyUnicode_FromObject(substr);
3627 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003628 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 return -1;
3630 }
Tim Petersced69f82003-09-16 20:30:58 +00003631
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632 result = count((PyUnicodeObject *)str,
3633 start, end,
3634 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003635
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636 Py_DECREF(str);
3637 Py_DECREF(substr);
3638 return result;
3639}
3640
Tim Petersced69f82003-09-16 20:30:58 +00003641static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642int findstring(PyUnicodeObject *self,
3643 PyUnicodeObject *substring,
3644 int start,
3645 int end,
3646 int direction)
3647{
3648 if (start < 0)
3649 start += self->length;
3650 if (start < 0)
3651 start = 0;
3652
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 if (end > self->length)
3654 end = self->length;
3655 if (end < 0)
3656 end += self->length;
3657 if (end < 0)
3658 end = 0;
3659
Guido van Rossum76afbd92002-08-20 17:29:29 +00003660 if (substring->length == 0)
3661 return (direction > 0) ? start : end;
3662
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 end -= substring->length;
3664
3665 if (direction < 0) {
3666 for (; end >= start; end--)
3667 if (Py_UNICODE_MATCH(self, end, substring))
3668 return end;
3669 } else {
3670 for (; start <= end; start++)
3671 if (Py_UNICODE_MATCH(self, start, substring))
3672 return start;
3673 }
3674
3675 return -1;
3676}
3677
3678int PyUnicode_Find(PyObject *str,
3679 PyObject *substr,
3680 int start,
3681 int end,
3682 int direction)
3683{
3684 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003685
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 str = PyUnicode_FromObject(str);
3687 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003688 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 substr = PyUnicode_FromObject(substr);
3690 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003691 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003692 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 }
Tim Petersced69f82003-09-16 20:30:58 +00003694
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 result = findstring((PyUnicodeObject *)str,
3696 (PyUnicodeObject *)substr,
3697 start, end, direction);
3698 Py_DECREF(str);
3699 Py_DECREF(substr);
3700 return result;
3701}
3702
Tim Petersced69f82003-09-16 20:30:58 +00003703static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704int tailmatch(PyUnicodeObject *self,
3705 PyUnicodeObject *substring,
3706 int start,
3707 int end,
3708 int direction)
3709{
3710 if (start < 0)
3711 start += self->length;
3712 if (start < 0)
3713 start = 0;
3714
3715 if (substring->length == 0)
3716 return 1;
3717
3718 if (end > self->length)
3719 end = self->length;
3720 if (end < 0)
3721 end += self->length;
3722 if (end < 0)
3723 end = 0;
3724
3725 end -= substring->length;
3726 if (end < start)
3727 return 0;
3728
3729 if (direction > 0) {
3730 if (Py_UNICODE_MATCH(self, end, substring))
3731 return 1;
3732 } else {
3733 if (Py_UNICODE_MATCH(self, start, substring))
3734 return 1;
3735 }
3736
3737 return 0;
3738}
3739
3740int PyUnicode_Tailmatch(PyObject *str,
3741 PyObject *substr,
3742 int start,
3743 int end,
3744 int direction)
3745{
3746 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003747
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748 str = PyUnicode_FromObject(str);
3749 if (str == NULL)
3750 return -1;
3751 substr = PyUnicode_FromObject(substr);
3752 if (substr == NULL) {
3753 Py_DECREF(substr);
3754 return -1;
3755 }
Tim Petersced69f82003-09-16 20:30:58 +00003756
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 result = tailmatch((PyUnicodeObject *)str,
3758 (PyUnicodeObject *)substr,
3759 start, end, direction);
3760 Py_DECREF(str);
3761 Py_DECREF(substr);
3762 return result;
3763}
3764
Tim Petersced69f82003-09-16 20:30:58 +00003765static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766const Py_UNICODE *findchar(const Py_UNICODE *s,
3767 int size,
3768 Py_UNICODE ch)
3769{
3770 /* like wcschr, but doesn't stop at NULL characters */
3771
3772 while (size-- > 0) {
3773 if (*s == ch)
3774 return s;
3775 s++;
3776 }
3777
3778 return NULL;
3779}
3780
3781/* Apply fixfct filter to the Unicode object self and return a
3782 reference to the modified object */
3783
Tim Petersced69f82003-09-16 20:30:58 +00003784static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785PyObject *fixup(PyUnicodeObject *self,
3786 int (*fixfct)(PyUnicodeObject *s))
3787{
3788
3789 PyUnicodeObject *u;
3790
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003791 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792 if (u == NULL)
3793 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003794
3795 Py_UNICODE_COPY(u->str, self->str, self->length);
3796
Tim Peters7a29bd52001-09-12 03:03:31 +00003797 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798 /* fixfct should return TRUE if it modified the buffer. If
3799 FALSE, return a reference to the original buffer instead
3800 (to save space, not time) */
3801 Py_INCREF(self);
3802 Py_DECREF(u);
3803 return (PyObject*) self;
3804 }
3805 return (PyObject*) u;
3806}
3807
Tim Petersced69f82003-09-16 20:30:58 +00003808static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809int fixupper(PyUnicodeObject *self)
3810{
3811 int len = self->length;
3812 Py_UNICODE *s = self->str;
3813 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003814
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 while (len-- > 0) {
3816 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003817
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 ch = Py_UNICODE_TOUPPER(*s);
3819 if (ch != *s) {
3820 status = 1;
3821 *s = ch;
3822 }
3823 s++;
3824 }
3825
3826 return status;
3827}
3828
Tim Petersced69f82003-09-16 20:30:58 +00003829static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830int fixlower(PyUnicodeObject *self)
3831{
3832 int len = self->length;
3833 Py_UNICODE *s = self->str;
3834 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003835
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 while (len-- > 0) {
3837 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003838
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 ch = Py_UNICODE_TOLOWER(*s);
3840 if (ch != *s) {
3841 status = 1;
3842 *s = ch;
3843 }
3844 s++;
3845 }
3846
3847 return status;
3848}
3849
Tim Petersced69f82003-09-16 20:30:58 +00003850static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851int fixswapcase(PyUnicodeObject *self)
3852{
3853 int len = self->length;
3854 Py_UNICODE *s = self->str;
3855 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003856
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857 while (len-- > 0) {
3858 if (Py_UNICODE_ISUPPER(*s)) {
3859 *s = Py_UNICODE_TOLOWER(*s);
3860 status = 1;
3861 } else if (Py_UNICODE_ISLOWER(*s)) {
3862 *s = Py_UNICODE_TOUPPER(*s);
3863 status = 1;
3864 }
3865 s++;
3866 }
3867
3868 return status;
3869}
3870
Tim Petersced69f82003-09-16 20:30:58 +00003871static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872int fixcapitalize(PyUnicodeObject *self)
3873{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003874 int len = self->length;
3875 Py_UNICODE *s = self->str;
3876 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003877
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003878 if (len == 0)
3879 return 0;
3880 if (Py_UNICODE_ISLOWER(*s)) {
3881 *s = Py_UNICODE_TOUPPER(*s);
3882 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003884 s++;
3885 while (--len > 0) {
3886 if (Py_UNICODE_ISUPPER(*s)) {
3887 *s = Py_UNICODE_TOLOWER(*s);
3888 status = 1;
3889 }
3890 s++;
3891 }
3892 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893}
3894
3895static
3896int fixtitle(PyUnicodeObject *self)
3897{
3898 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3899 register Py_UNICODE *e;
3900 int previous_is_cased;
3901
3902 /* Shortcut for single character strings */
3903 if (PyUnicode_GET_SIZE(self) == 1) {
3904 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3905 if (*p != ch) {
3906 *p = ch;
3907 return 1;
3908 }
3909 else
3910 return 0;
3911 }
Tim Petersced69f82003-09-16 20:30:58 +00003912
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 e = p + PyUnicode_GET_SIZE(self);
3914 previous_is_cased = 0;
3915 for (; p < e; p++) {
3916 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003917
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918 if (previous_is_cased)
3919 *p = Py_UNICODE_TOLOWER(ch);
3920 else
3921 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003922
3923 if (Py_UNICODE_ISLOWER(ch) ||
3924 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 Py_UNICODE_ISTITLE(ch))
3926 previous_is_cased = 1;
3927 else
3928 previous_is_cased = 0;
3929 }
3930 return 1;
3931}
3932
3933PyObject *PyUnicode_Join(PyObject *separator,
3934 PyObject *seq)
3935{
3936 Py_UNICODE *sep;
3937 int seplen;
3938 PyUnicodeObject *res = NULL;
3939 int reslen = 0;
3940 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 int sz = 100;
3942 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003943 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944
Tim Peters2cfe3682001-05-05 05:36:48 +00003945 it = PyObject_GetIter(seq);
3946 if (it == NULL)
3947 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948
3949 if (separator == NULL) {
3950 Py_UNICODE blank = ' ';
3951 sep = &blank;
3952 seplen = 1;
3953 }
3954 else {
3955 separator = PyUnicode_FromObject(separator);
3956 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003957 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 sep = PyUnicode_AS_UNICODE(separator);
3959 seplen = PyUnicode_GET_SIZE(separator);
3960 }
Tim Petersced69f82003-09-16 20:30:58 +00003961
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962 res = _PyUnicode_New(sz);
3963 if (res == NULL)
3964 goto onError;
3965 p = PyUnicode_AS_UNICODE(res);
3966 reslen = 0;
3967
Tim Peters2cfe3682001-05-05 05:36:48 +00003968 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003970 PyObject *item = PyIter_Next(it);
3971 if (item == NULL) {
3972 if (PyErr_Occurred())
3973 goto onError;
3974 break;
3975 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976 if (!PyUnicode_Check(item)) {
3977 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003978 if (!PyString_Check(item)) {
3979 PyErr_Format(PyExc_TypeError,
3980 "sequence item %i: expected string or Unicode,"
3981 " %.80s found",
3982 i, item->ob_type->tp_name);
3983 Py_DECREF(item);
3984 goto onError;
3985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986 v = PyUnicode_FromObject(item);
3987 Py_DECREF(item);
3988 item = v;
3989 if (item == NULL)
3990 goto onError;
3991 }
3992 itemlen = PyUnicode_GET_SIZE(item);
3993 while (reslen + itemlen + seplen >= sz) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003994 if (_PyUnicode_Resize(&res, sz*2) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003995 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 sz *= 2;
3999 p = PyUnicode_AS_UNICODE(res) + reslen;
4000 }
4001 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004002 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 p += seplen;
4004 reslen += seplen;
4005 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004006 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 p += itemlen;
4008 reslen += itemlen;
4009 Py_DECREF(item);
4010 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004011 if (_PyUnicode_Resize(&res, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012 goto onError;
4013
4014 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004015 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 return (PyObject *)res;
4017
4018 onError:
4019 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004020 Py_XDECREF(res);
4021 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022 return NULL;
4023}
4024
Tim Petersced69f82003-09-16 20:30:58 +00004025static
4026PyUnicodeObject *pad(PyUnicodeObject *self,
4027 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028 int right,
4029 Py_UNICODE fill)
4030{
4031 PyUnicodeObject *u;
4032
4033 if (left < 0)
4034 left = 0;
4035 if (right < 0)
4036 right = 0;
4037
Tim Peters7a29bd52001-09-12 03:03:31 +00004038 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 Py_INCREF(self);
4040 return self;
4041 }
4042
4043 u = _PyUnicode_New(left + self->length + right);
4044 if (u) {
4045 if (left)
4046 Py_UNICODE_FILL(u->str, fill, left);
4047 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4048 if (right)
4049 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4050 }
4051
4052 return u;
4053}
4054
/* Append the unicode object built from data[left:right] to list.
   The expansion relies on names from the enclosing function: a
   PyObject *str scratch variable, the list object named list, and an
   onError label that is jumped to on failure.
   Wrapped in do { } while (0) so that the macro acts as one statement
   and is safe inside unbraced if/else; invoke it with a trailing ';'. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list now holds its own reference */                      \
        Py_DECREF(str);                                                 \
    } while (0)
4065
/* Insert the unicode object built from data[left:right] at the front of
   list.
   The expansion relies on names from the enclosing function: a
   PyObject *str scratch variable, the list object named list, and an
   onError label that is jumped to on failure.
   Wrapped in do { } while (0) so that the macro acts as one statement
   and is safe inside unbraced if/else; invoke it with a trailing ';'. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list now holds its own reference */                      \
        Py_DECREF(str);                                                 \
    } while (0)
4076
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077static
4078PyObject *split_whitespace(PyUnicodeObject *self,
4079 PyObject *list,
4080 int maxcount)
4081{
4082 register int i;
4083 register int j;
4084 int len = self->length;
4085 PyObject *str;
4086
4087 for (i = j = 0; i < len; ) {
4088 /* find a token */
4089 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4090 i++;
4091 j = i;
4092 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4093 i++;
4094 if (j < i) {
4095 if (maxcount-- <= 0)
4096 break;
4097 SPLIT_APPEND(self->str, j, i);
4098 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4099 i++;
4100 j = i;
4101 }
4102 }
4103 if (j < len) {
4104 SPLIT_APPEND(self->str, j, len);
4105 }
4106 return list;
4107
4108 onError:
4109 Py_DECREF(list);
4110 return NULL;
4111}
4112
4113PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004114 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115{
4116 register int i;
4117 register int j;
4118 int len;
4119 PyObject *list;
4120 PyObject *str;
4121 Py_UNICODE *data;
4122
4123 string = PyUnicode_FromObject(string);
4124 if (string == NULL)
4125 return NULL;
4126 data = PyUnicode_AS_UNICODE(string);
4127 len = PyUnicode_GET_SIZE(string);
4128
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 list = PyList_New(0);
4130 if (!list)
4131 goto onError;
4132
4133 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004134 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004135
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 /* Find a line and append it */
4137 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4138 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139
4140 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004141 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 if (i < len) {
4143 if (data[i] == '\r' && i + 1 < len &&
4144 data[i+1] == '\n')
4145 i += 2;
4146 else
4147 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004148 if (keepends)
4149 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 }
Guido van Rossum86662912000-04-11 15:38:46 +00004151 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152 j = i;
4153 }
4154 if (j < len) {
4155 SPLIT_APPEND(data, j, len);
4156 }
4157
4158 Py_DECREF(string);
4159 return list;
4160
4161 onError:
4162 Py_DECREF(list);
4163 Py_DECREF(string);
4164 return NULL;
4165}
4166
Tim Petersced69f82003-09-16 20:30:58 +00004167static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168PyObject *split_char(PyUnicodeObject *self,
4169 PyObject *list,
4170 Py_UNICODE ch,
4171 int maxcount)
4172{
4173 register int i;
4174 register int j;
4175 int len = self->length;
4176 PyObject *str;
4177
4178 for (i = j = 0; i < len; ) {
4179 if (self->str[i] == ch) {
4180 if (maxcount-- <= 0)
4181 break;
4182 SPLIT_APPEND(self->str, j, i);
4183 i = j = i + 1;
4184 } else
4185 i++;
4186 }
4187 if (j <= len) {
4188 SPLIT_APPEND(self->str, j, len);
4189 }
4190 return list;
4191
4192 onError:
4193 Py_DECREF(list);
4194 return NULL;
4195}
4196
Tim Petersced69f82003-09-16 20:30:58 +00004197static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198PyObject *split_substring(PyUnicodeObject *self,
4199 PyObject *list,
4200 PyUnicodeObject *substring,
4201 int maxcount)
4202{
4203 register int i;
4204 register int j;
4205 int len = self->length;
4206 int sublen = substring->length;
4207 PyObject *str;
4208
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004209 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210 if (Py_UNICODE_MATCH(self, i, substring)) {
4211 if (maxcount-- <= 0)
4212 break;
4213 SPLIT_APPEND(self->str, j, i);
4214 i = j = i + sublen;
4215 } else
4216 i++;
4217 }
4218 if (j <= len) {
4219 SPLIT_APPEND(self->str, j, len);
4220 }
4221 return list;
4222
4223 onError:
4224 Py_DECREF(list);
4225 return NULL;
4226}
4227
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004228static
4229PyObject *rsplit_whitespace(PyUnicodeObject *self,
4230 PyObject *list,
4231 int maxcount)
4232{
4233 register int i;
4234 register int j;
4235 int len = self->length;
4236 PyObject *str;
4237
4238 for (i = j = len - 1; i >= 0; ) {
4239 /* find a token */
4240 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4241 i--;
4242 j = i;
4243 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4244 i--;
4245 if (j > i) {
4246 if (maxcount-- <= 0)
4247 break;
4248 SPLIT_INSERT(self->str, i + 1, j + 1);
4249 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4250 i--;
4251 j = i;
4252 }
4253 }
4254 if (j >= 0) {
4255 SPLIT_INSERT(self->str, 0, j + 1);
4256 }
4257 return list;
4258
4259 onError:
4260 Py_DECREF(list);
4261 return NULL;
4262}
4263
4264static
4265PyObject *rsplit_char(PyUnicodeObject *self,
4266 PyObject *list,
4267 Py_UNICODE ch,
4268 int maxcount)
4269{
4270 register int i;
4271 register int j;
4272 int len = self->length;
4273 PyObject *str;
4274
4275 for (i = j = len - 1; i >= 0; ) {
4276 if (self->str[i] == ch) {
4277 if (maxcount-- <= 0)
4278 break;
4279 SPLIT_INSERT(self->str, i + 1, j + 1);
4280 j = i = i - 1;
4281 } else
4282 i--;
4283 }
4284 if (j >= 0) {
4285 SPLIT_INSERT(self->str, 0, j + 1);
4286 }
4287 return list;
4288
4289 onError:
4290 Py_DECREF(list);
4291 return NULL;
4292}
4293
4294static
4295PyObject *rsplit_substring(PyUnicodeObject *self,
4296 PyObject *list,
4297 PyUnicodeObject *substring,
4298 int maxcount)
4299{
4300 register int i;
4301 register int j;
4302 int len = self->length;
4303 int sublen = substring->length;
4304 PyObject *str;
4305
4306 for (i = len - sublen, j = len; i >= 0; ) {
4307 if (Py_UNICODE_MATCH(self, i, substring)) {
4308 if (maxcount-- <= 0)
4309 break;
4310 SPLIT_INSERT(self->str, i + sublen, j);
4311 j = i;
4312 i -= sublen;
4313 } else
4314 i--;
4315 }
4316 if (j >= 0) {
4317 SPLIT_INSERT(self->str, 0, j);
4318 }
4319 return list;
4320
4321 onError:
4322 Py_DECREF(list);
4323 return NULL;
4324}
4325
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004327#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328
4329static
4330PyObject *split(PyUnicodeObject *self,
4331 PyUnicodeObject *substring,
4332 int maxcount)
4333{
4334 PyObject *list;
4335
4336 if (maxcount < 0)
4337 maxcount = INT_MAX;
4338
4339 list = PyList_New(0);
4340 if (!list)
4341 return NULL;
4342
4343 if (substring == NULL)
4344 return split_whitespace(self,list,maxcount);
4345
4346 else if (substring->length == 1)
4347 return split_char(self,list,substring->str[0],maxcount);
4348
4349 else if (substring->length == 0) {
4350 Py_DECREF(list);
4351 PyErr_SetString(PyExc_ValueError, "empty separator");
4352 return NULL;
4353 }
4354 else
4355 return split_substring(self,list,substring,maxcount);
4356}
4357
Tim Petersced69f82003-09-16 20:30:58 +00004358static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004359PyObject *rsplit(PyUnicodeObject *self,
4360 PyUnicodeObject *substring,
4361 int maxcount)
4362{
4363 PyObject *list;
4364
4365 if (maxcount < 0)
4366 maxcount = INT_MAX;
4367
4368 list = PyList_New(0);
4369 if (!list)
4370 return NULL;
4371
4372 if (substring == NULL)
4373 return rsplit_whitespace(self,list,maxcount);
4374
4375 else if (substring->length == 1)
4376 return rsplit_char(self,list,substring->str[0],maxcount);
4377
4378 else if (substring->length == 0) {
4379 Py_DECREF(list);
4380 PyErr_SetString(PyExc_ValueError, "empty separator");
4381 return NULL;
4382 }
4383 else
4384 return rsplit_substring(self,list,substring,maxcount);
4385}
4386
4387static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388PyObject *replace(PyUnicodeObject *self,
4389 PyUnicodeObject *str1,
4390 PyUnicodeObject *str2,
4391 int maxcount)
4392{
4393 PyUnicodeObject *u;
4394
4395 if (maxcount < 0)
4396 maxcount = INT_MAX;
4397
4398 if (str1->length == 1 && str2->length == 1) {
4399 int i;
4400
4401 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004402 if (!findchar(self->str, self->length, str1->str[0]) &&
4403 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404 /* nothing to replace, return original string */
4405 Py_INCREF(self);
4406 u = self;
4407 } else {
4408 Py_UNICODE u1 = str1->str[0];
4409 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004410
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004412 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 self->length
4414 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004415 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004416 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004417 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418 for (i = 0; i < u->length; i++)
4419 if (u->str[i] == u1) {
4420 if (--maxcount < 0)
4421 break;
4422 u->str[i] = u2;
4423 }
4424 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426
4427 } else {
4428 int n, i;
4429 Py_UNICODE *p;
4430
4431 /* replace strings */
4432 n = count(self, 0, self->length, str1);
4433 if (n > maxcount)
4434 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004435 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004437 if (PyUnicode_CheckExact(self)) {
4438 Py_INCREF(self);
4439 u = self;
4440 }
4441 else {
4442 u = (PyUnicodeObject *)
4443 PyUnicode_FromUnicode(self->str, self->length);
4444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445 } else {
4446 u = _PyUnicode_New(
4447 self->length + n * (str2->length - str1->length));
4448 if (u) {
4449 i = 0;
4450 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004451 if (str1->length > 0) {
4452 while (i <= self->length - str1->length)
4453 if (Py_UNICODE_MATCH(self, i, str1)) {
4454 /* replace string segment */
4455 Py_UNICODE_COPY(p, str2->str, str2->length);
4456 p += str2->length;
4457 i += str1->length;
4458 if (--n <= 0) {
4459 /* copy remaining part */
4460 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4461 break;
4462 }
4463 } else
4464 *p++ = self->str[i++];
4465 } else {
4466 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467 Py_UNICODE_COPY(p, str2->str, str2->length);
4468 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004469 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004472 }
4473 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 }
4476 }
4477 }
Tim Petersced69f82003-09-16 20:30:58 +00004478
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479 return (PyObject *) u;
4480}
4481
4482/* --- Unicode Object Methods --------------------------------------------- */
4483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004484PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004485"S.title() -> unicode\n\
4486\n\
4487Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004488characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489
4490static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004491unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493 return fixup(self, fixtitle);
4494}
4495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004496PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497"S.capitalize() -> unicode\n\
4498\n\
4499Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004500have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501
4502static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004503unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505 return fixup(self, fixcapitalize);
4506}
4507
#if 0
/* Disabled (compiled out): kept for reference only. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, run fixup(..., fixcapitalize) on every word,
   and join the words again via PyUnicode_Join(NULL, list).  Returns the
   joined string, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);

        item = fixup((PyUnicodeObject *)old, fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

onError:
    /* item is NULL here when a fixup() call failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
4545
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004546/* Argument converter. Coerces to a single unicode character */
4547
4548static int
4549convert_uc(PyObject *obj, void *addr)
4550{
4551 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4552 PyObject *uniobj;
4553 Py_UNICODE *unistr;
4554
4555 uniobj = PyUnicode_FromObject(obj);
4556 if (uniobj == NULL) {
4557 PyErr_SetString(PyExc_TypeError,
4558 "The fill character cannot be converted to Unicode");
4559 return 0;
4560 }
4561 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4562 PyErr_SetString(PyExc_TypeError,
4563 "The fill character must be exactly one character long");
4564 Py_DECREF(uniobj);
4565 return 0;
4566 }
4567 unistr = PyUnicode_AS_UNICODE(uniobj);
4568 *fillcharloc = unistr[0];
4569 Py_DECREF(uniobj);
4570 return 1;
4571}
4572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004573PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004574"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004576Return S centered in a Unicode string of length width. Padding is\n\
4577done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578
4579static PyObject *
4580unicode_center(PyUnicodeObject *self, PyObject *args)
4581{
4582 int marg, left;
4583 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004584 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004586 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587 return NULL;
4588
Tim Peters7a29bd52001-09-12 03:03:31 +00004589 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590 Py_INCREF(self);
4591 return (PyObject*) self;
4592 }
4593
4594 marg = width - self->length;
4595 left = marg / 2 + (marg & width & 1);
4596
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004597 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598}
4599
Marc-André Lemburge5034372000-08-08 08:04:29 +00004600#if 0
4601
4602/* This code should go into some future Unicode collation support
4603 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004604 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004605
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004606/* speedy UTF-16 code point order comparison */
4607/* gleaned from: */
4608/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4609
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004610static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004611{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004612 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004613 0, 0, 0, 0, 0, 0, 0, 0,
4614 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004615 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004616};
4617
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618static int
4619unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4620{
4621 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004622
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623 Py_UNICODE *s1 = str1->str;
4624 Py_UNICODE *s2 = str2->str;
4625
4626 len1 = str1->length;
4627 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004628
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004630 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004631
4632 c1 = *s1++;
4633 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004634
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004635 if (c1 > (1<<11) * 26)
4636 c1 += utf16Fixup[c1>>11];
4637 if (c2 > (1<<11) * 26)
4638 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004639 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004640
4641 if (c1 != c2)
4642 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004643
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004644 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645 }
4646
4647 return (len1 < len2) ? -1 : (len1 != len2);
4648}
4649
Marc-André Lemburge5034372000-08-08 08:04:29 +00004650#else
4651
4652static int
4653unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4654{
4655 register int len1, len2;
4656
4657 Py_UNICODE *s1 = str1->str;
4658 Py_UNICODE *s2 = str2->str;
4659
4660 len1 = str1->length;
4661 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004662
Marc-André Lemburge5034372000-08-08 08:04:29 +00004663 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004664 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004665
Fredrik Lundh45714e92001-06-26 16:39:36 +00004666 c1 = *s1++;
4667 c2 = *s2++;
4668
4669 if (c1 != c2)
4670 return (c1 < c2) ? -1 : 1;
4671
Marc-André Lemburge5034372000-08-08 08:04:29 +00004672 len1--; len2--;
4673 }
4674
4675 return (len1 < len2) ? -1 : (len1 != len2);
4676}
4677
4678#endif
4679
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680int PyUnicode_Compare(PyObject *left,
4681 PyObject *right)
4682{
4683 PyUnicodeObject *u = NULL, *v = NULL;
4684 int result;
4685
4686 /* Coerce the two arguments */
4687 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4688 if (u == NULL)
4689 goto onError;
4690 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4691 if (v == NULL)
4692 goto onError;
4693
Thomas Wouters7e474022000-07-16 12:04:32 +00004694 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695 if (v == u) {
4696 Py_DECREF(u);
4697 Py_DECREF(v);
4698 return 0;
4699 }
4700
4701 result = unicode_compare(u, v);
4702
4703 Py_DECREF(u);
4704 Py_DECREF(v);
4705 return result;
4706
4707onError:
4708 Py_XDECREF(u);
4709 Py_XDECREF(v);
4710 return -1;
4711}
4712
Guido van Rossum403d68b2000-03-13 15:55:09 +00004713int PyUnicode_Contains(PyObject *container,
4714 PyObject *element)
4715{
4716 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004717 int result, size;
4718 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004719
4720 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004721 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004722 if (v == NULL) {
4723 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004724 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004725 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004726 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004727 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004728 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004729 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004730
Barry Warsaw817918c2002-08-06 16:58:21 +00004731 size = PyUnicode_GET_SIZE(v);
4732 rhs = PyUnicode_AS_UNICODE(v);
4733 lhs = PyUnicode_AS_UNICODE(u);
4734
Guido van Rossum403d68b2000-03-13 15:55:09 +00004735 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004736 if (size == 1) {
4737 end = lhs + PyUnicode_GET_SIZE(u);
4738 while (lhs < end) {
4739 if (*lhs++ == *rhs) {
4740 result = 1;
4741 break;
4742 }
4743 }
4744 }
4745 else {
4746 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4747 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004748 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004749 result = 1;
4750 break;
4751 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004752 }
4753 }
4754
4755 Py_DECREF(u);
4756 Py_DECREF(v);
4757 return result;
4758
4759onError:
4760 Py_XDECREF(u);
4761 Py_XDECREF(v);
4762 return -1;
4763}
4764
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765/* Concat to string or Unicode object giving a new Unicode object. */
4766
4767PyObject *PyUnicode_Concat(PyObject *left,
4768 PyObject *right)
4769{
4770 PyUnicodeObject *u = NULL, *v = NULL, *w;
4771
4772 /* Coerce the two arguments */
4773 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4774 if (u == NULL)
4775 goto onError;
4776 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4777 if (v == NULL)
4778 goto onError;
4779
4780 /* Shortcuts */
4781 if (v == unicode_empty) {
4782 Py_DECREF(v);
4783 return (PyObject *)u;
4784 }
4785 if (u == unicode_empty) {
4786 Py_DECREF(u);
4787 return (PyObject *)v;
4788 }
4789
4790 /* Concat the two Unicode strings */
4791 w = _PyUnicode_New(u->length + v->length);
4792 if (w == NULL)
4793 goto onError;
4794 Py_UNICODE_COPY(w->str, u->str, u->length);
4795 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4796
4797 Py_DECREF(u);
4798 Py_DECREF(v);
4799 return (PyObject *)w;
4800
4801onError:
4802 Py_XDECREF(u);
4803 Py_XDECREF(v);
4804 return NULL;
4805}
4806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004807PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808"S.count(sub[, start[, end]]) -> int\n\
4809\n\
4810Return the number of occurrences of substring sub in Unicode string\n\
4811S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004812interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813
4814static PyObject *
4815unicode_count(PyUnicodeObject *self, PyObject *args)
4816{
4817 PyUnicodeObject *substring;
4818 int start = 0;
4819 int end = INT_MAX;
4820 PyObject *result;
4821
Guido van Rossumb8872e62000-05-09 14:14:27 +00004822 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4823 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 return NULL;
4825
4826 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4827 (PyObject *)substring);
4828 if (substring == NULL)
4829 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004830
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 if (start < 0)
4832 start += self->length;
4833 if (start < 0)
4834 start = 0;
4835 if (end > self->length)
4836 end = self->length;
4837 if (end < 0)
4838 end += self->length;
4839 if (end < 0)
4840 end = 0;
4841
4842 result = PyInt_FromLong((long) count(self, start, end, substring));
4843
4844 Py_DECREF(substring);
4845 return result;
4846}
4847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004848PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849"S.encode([encoding[,errors]]) -> string\n\
4850\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004851Return an encoded string version of S. Default encoding is the current\n\
4852default string encoding. errors may be given to set a different error\n\
4853handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004854a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4855'xmlcharrefreplace' as well as any other name registered with\n\
4856codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857
4858static PyObject *
4859unicode_encode(PyUnicodeObject *self, PyObject *args)
4860{
4861 char *encoding = NULL;
4862 char *errors = NULL;
4863 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4864 return NULL;
4865 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4866}
4867
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004868PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869"S.expandtabs([tabsize]) -> unicode\n\
4870\n\
4871Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004872If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873
4874static PyObject*
4875unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4876{
4877 Py_UNICODE *e;
4878 Py_UNICODE *p;
4879 Py_UNICODE *q;
4880 int i, j;
4881 PyUnicodeObject *u;
4882 int tabsize = 8;
4883
4884 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4885 return NULL;
4886
Thomas Wouters7e474022000-07-16 12:04:32 +00004887 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 i = j = 0;
4889 e = self->str + self->length;
4890 for (p = self->str; p < e; p++)
4891 if (*p == '\t') {
4892 if (tabsize > 0)
4893 j += tabsize - (j % tabsize);
4894 }
4895 else {
4896 j++;
4897 if (*p == '\n' || *p == '\r') {
4898 i += j;
4899 j = 0;
4900 }
4901 }
4902
4903 /* Second pass: create output string and fill it */
4904 u = _PyUnicode_New(i + j);
4905 if (!u)
4906 return NULL;
4907
4908 j = 0;
4909 q = u->str;
4910
4911 for (p = self->str; p < e; p++)
4912 if (*p == '\t') {
4913 if (tabsize > 0) {
4914 i = tabsize - (j % tabsize);
4915 j += i;
4916 while (i--)
4917 *q++ = ' ';
4918 }
4919 }
4920 else {
4921 j++;
4922 *q++ = *p;
4923 if (*p == '\n' || *p == '\r')
4924 j = 0;
4925 }
4926
4927 return (PyObject*) u;
4928}
4929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004930PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931"S.find(sub [,start [,end]]) -> int\n\
4932\n\
4933Return the lowest index in S where substring sub is found,\n\
4934such that sub is contained within s[start,end]. Optional\n\
4935arguments start and end are interpreted as in slice notation.\n\
4936\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004937Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938
4939static PyObject *
4940unicode_find(PyUnicodeObject *self, PyObject *args)
4941{
4942 PyUnicodeObject *substring;
4943 int start = 0;
4944 int end = INT_MAX;
4945 PyObject *result;
4946
Guido van Rossumb8872e62000-05-09 14:14:27 +00004947 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4948 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949 return NULL;
4950 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4951 (PyObject *)substring);
4952 if (substring == NULL)
4953 return NULL;
4954
4955 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4956
4957 Py_DECREF(substring);
4958 return result;
4959}
4960
4961static PyObject *
4962unicode_getitem(PyUnicodeObject *self, int index)
4963{
4964 if (index < 0 || index >= self->length) {
4965 PyErr_SetString(PyExc_IndexError, "string index out of range");
4966 return NULL;
4967 }
4968
4969 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4970}
4971
4972static long
4973unicode_hash(PyUnicodeObject *self)
4974{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004975 /* Since Unicode objects compare equal to their ASCII string
4976 counterparts, they should use the individual character values
4977 as basis for their hash value. This is needed to assure that
4978 strings and Unicode objects behave in the same way as
4979 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004980
Fredrik Lundhdde61642000-07-10 18:27:47 +00004981 register int len;
4982 register Py_UNICODE *p;
4983 register long x;
4984
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985 if (self->hash != -1)
4986 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004987 len = PyUnicode_GET_SIZE(self);
4988 p = PyUnicode_AS_UNICODE(self);
4989 x = *p << 7;
4990 while (--len >= 0)
4991 x = (1000003*x) ^ *p++;
4992 x ^= PyUnicode_GET_SIZE(self);
4993 if (x == -1)
4994 x = -2;
4995 self->hash = x;
4996 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997}
4998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004999PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000"S.index(sub [,start [,end]]) -> int\n\
5001\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005002Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003
5004static PyObject *
5005unicode_index(PyUnicodeObject *self, PyObject *args)
5006{
5007 int result;
5008 PyUnicodeObject *substring;
5009 int start = 0;
5010 int end = INT_MAX;
5011
Guido van Rossumb8872e62000-05-09 14:14:27 +00005012 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5013 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005015
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5017 (PyObject *)substring);
5018 if (substring == NULL)
5019 return NULL;
5020
5021 result = findstring(self, substring, start, end, 1);
5022
5023 Py_DECREF(substring);
5024 if (result < 0) {
5025 PyErr_SetString(PyExc_ValueError, "substring not found");
5026 return NULL;
5027 }
5028 return PyInt_FromLong(result);
5029}
5030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005031PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005032"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005034Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005035at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036
5037static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005038unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039{
5040 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5041 register const Py_UNICODE *e;
5042 int cased;
5043
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044 /* Shortcut for single character strings */
5045 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005046 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005048 /* Special case for empty strings */
5049 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005050 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005051
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052 e = p + PyUnicode_GET_SIZE(self);
5053 cased = 0;
5054 for (; p < e; p++) {
5055 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005056
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005058 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059 else if (!cased && Py_UNICODE_ISLOWER(ch))
5060 cased = 1;
5061 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005062 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063}
5064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005065PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005066"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005068Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005069at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070
5071static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005072unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073{
5074 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5075 register const Py_UNICODE *e;
5076 int cased;
5077
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 /* Shortcut for single character strings */
5079 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005080 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005082 /* Special case for empty strings */
5083 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005084 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005085
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086 e = p + PyUnicode_GET_SIZE(self);
5087 cased = 0;
5088 for (; p < e; p++) {
5089 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005090
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005092 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093 else if (!cased && Py_UNICODE_ISUPPER(ch))
5094 cased = 1;
5095 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005096 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097}
5098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005099PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005100"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005102Return True if S is a titlecased string and there is at least one\n\
5103character in S, i.e. upper- and titlecase characters may only\n\
5104follow uncased characters and lowercase characters only cased ones.\n\
5105Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106
5107static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005108unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109{
5110 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5111 register const Py_UNICODE *e;
5112 int cased, previous_is_cased;
5113
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 /* Shortcut for single character strings */
5115 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005116 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5117 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005119 /* Special case for empty strings */
5120 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005121 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005122
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 e = p + PyUnicode_GET_SIZE(self);
5124 cased = 0;
5125 previous_is_cased = 0;
5126 for (; p < e; p++) {
5127 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005128
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5130 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005131 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 previous_is_cased = 1;
5133 cased = 1;
5134 }
5135 else if (Py_UNICODE_ISLOWER(ch)) {
5136 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005137 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138 previous_is_cased = 1;
5139 cased = 1;
5140 }
5141 else
5142 previous_is_cased = 0;
5143 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005144 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145}
5146
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005147PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005148"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005150Return True if all characters in S are whitespace\n\
5151and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152
5153static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005154unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155{
5156 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5157 register const Py_UNICODE *e;
5158
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159 /* Shortcut for single character strings */
5160 if (PyUnicode_GET_SIZE(self) == 1 &&
5161 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005162 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005164 /* Special case for empty strings */
5165 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005166 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005167
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 e = p + PyUnicode_GET_SIZE(self);
5169 for (; p < e; p++) {
5170 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005171 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005173 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174}
5175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005176PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005177"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005178\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005179Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005180and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005181
5182static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005183unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005184{
5185 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5186 register const Py_UNICODE *e;
5187
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005188 /* Shortcut for single character strings */
5189 if (PyUnicode_GET_SIZE(self) == 1 &&
5190 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005191 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005192
5193 /* Special case for empty strings */
5194 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005195 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005196
5197 e = p + PyUnicode_GET_SIZE(self);
5198 for (; p < e; p++) {
5199 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005200 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005201 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005202 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005203}
5204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005205PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005206"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005207\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005208Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005209and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005210
5211static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005212unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005213{
5214 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5215 register const Py_UNICODE *e;
5216
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005217 /* Shortcut for single character strings */
5218 if (PyUnicode_GET_SIZE(self) == 1 &&
5219 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005220 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005221
5222 /* Special case for empty strings */
5223 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005224 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005225
5226 e = p + PyUnicode_GET_SIZE(self);
5227 for (; p < e; p++) {
5228 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005229 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005230 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005231 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005232}
5233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005234PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005235"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005237Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005238False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239
5240static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005241unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242{
5243 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5244 register const Py_UNICODE *e;
5245
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 /* Shortcut for single character strings */
5247 if (PyUnicode_GET_SIZE(self) == 1 &&
5248 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005249 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005251 /* Special case for empty strings */
5252 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005253 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005254
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 e = p + PyUnicode_GET_SIZE(self);
5256 for (; p < e; p++) {
5257 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005258 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005260 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261}
5262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005263PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005264"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005266Return True if all characters in S are digits\n\
5267and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268
5269static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005270unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271{
5272 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5273 register const Py_UNICODE *e;
5274
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 /* Shortcut for single character strings */
5276 if (PyUnicode_GET_SIZE(self) == 1 &&
5277 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005278 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005280 /* Special case for empty strings */
5281 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005282 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005283
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 e = p + PyUnicode_GET_SIZE(self);
5285 for (; p < e; p++) {
5286 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005287 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005289 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290}
5291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005292PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005293"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005295Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005296False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297
5298static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005299unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300{
5301 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5302 register const Py_UNICODE *e;
5303
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 /* Shortcut for single character strings */
5305 if (PyUnicode_GET_SIZE(self) == 1 &&
5306 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005307 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005309 /* Special case for empty strings */
5310 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005311 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005312
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 e = p + PyUnicode_GET_SIZE(self);
5314 for (; p < e; p++) {
5315 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005316 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005318 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319}
5320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005321PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322"S.join(sequence) -> unicode\n\
5323\n\
5324Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005325sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326
5327static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005328unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005330 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331}
5332
5333static int
5334unicode_length(PyUnicodeObject *self)
5335{
5336 return self->length;
5337}
5338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005339PyDoc_STRVAR(ljust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005340"S.ljust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341\n\
5342Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005343done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344
5345static PyObject *
5346unicode_ljust(PyUnicodeObject *self, PyObject *args)
5347{
5348 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005349 Py_UNICODE fillchar = ' ';
5350
5351 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 return NULL;
5353
Tim Peters7a29bd52001-09-12 03:03:31 +00005354 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 Py_INCREF(self);
5356 return (PyObject*) self;
5357 }
5358
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005359 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360}
5361
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005362PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363"S.lower() -> unicode\n\
5364\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005365Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366
5367static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005368unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370 return fixup(self, fixlower);
5371}
5372
/* Selectors telling the strip helpers which end(s) of the string to trim.
   They double as indexes into stripformat[] below. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
5381
5382static const Py_UNICODE *
5383unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5384{
Tim Peters030a5ce2002-04-22 19:00:10 +00005385 size_t i;
5386 for (i = 0; i < n; ++i)
5387 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005388 return s+i;
5389 return NULL;
5390}
5391
5392/* externally visible for str.strip(unicode) */
5393PyObject *
5394_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5395{
5396 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5397 int len = PyUnicode_GET_SIZE(self);
5398 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5399 int seplen = PyUnicode_GET_SIZE(sepobj);
5400 int i, j;
5401
5402 i = 0;
5403 if (striptype != RIGHTSTRIP) {
5404 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5405 i++;
5406 }
5407 }
5408
5409 j = len;
5410 if (striptype != LEFTSTRIP) {
5411 do {
5412 j--;
5413 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5414 j++;
5415 }
5416
5417 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5418 Py_INCREF(self);
5419 return (PyObject*)self;
5420 }
5421 else
5422 return PyUnicode_FromUnicode(s+i, j-i);
5423}
5424
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425
5426static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005427do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005429 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5430 int len = PyUnicode_GET_SIZE(self), i, j;
5431
5432 i = 0;
5433 if (striptype != RIGHTSTRIP) {
5434 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5435 i++;
5436 }
5437 }
5438
5439 j = len;
5440 if (striptype != LEFTSTRIP) {
5441 do {
5442 j--;
5443 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5444 j++;
5445 }
5446
5447 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5448 Py_INCREF(self);
5449 return (PyObject*)self;
5450 }
5451 else
5452 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453}
5454
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005455
5456static PyObject *
5457do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5458{
5459 PyObject *sep = NULL;
5460
5461 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5462 return NULL;
5463
5464 if (sep != NULL && sep != Py_None) {
5465 if (PyUnicode_Check(sep))
5466 return _PyUnicode_XStrip(self, striptype, sep);
5467 else if (PyString_Check(sep)) {
5468 PyObject *res;
5469 sep = PyUnicode_FromObject(sep);
5470 if (sep==NULL)
5471 return NULL;
5472 res = _PyUnicode_XStrip(self, striptype, sep);
5473 Py_DECREF(sep);
5474 return res;
5475 }
5476 else {
5477 PyErr_Format(PyExc_TypeError,
5478 "%s arg must be None, unicode or str",
5479 STRIPNAME(striptype));
5480 return NULL;
5481 }
5482 }
5483
5484 return do_strip(self, striptype);
5485}
5486
5487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005488PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005489"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005490\n\
5491Return a copy of the string S with leading and trailing\n\
5492whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005493If chars is given and not None, remove characters in chars instead.\n\
5494If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005495
5496static PyObject *
5497unicode_strip(PyUnicodeObject *self, PyObject *args)
5498{
5499 if (PyTuple_GET_SIZE(args) == 0)
5500 return do_strip(self, BOTHSTRIP); /* Common case */
5501 else
5502 return do_argstrip(self, BOTHSTRIP, args);
5503}
5504
5505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005506PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005507"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005508\n\
5509Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005510If chars is given and not None, remove characters in chars instead.\n\
5511If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005512
5513static PyObject *
5514unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5515{
5516 if (PyTuple_GET_SIZE(args) == 0)
5517 return do_strip(self, LEFTSTRIP); /* Common case */
5518 else
5519 return do_argstrip(self, LEFTSTRIP, args);
5520}
5521
5522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005523PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005524"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005525\n\
5526Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005527If chars is given and not None, remove characters in chars instead.\n\
5528If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005529
5530static PyObject *
5531unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5532{
5533 if (PyTuple_GET_SIZE(args) == 0)
5534 return do_strip(self, RIGHTSTRIP); /* Common case */
5535 else
5536 return do_argstrip(self, RIGHTSTRIP, args);
5537}
5538
5539
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540static PyObject*
5541unicode_repeat(PyUnicodeObject *str, int len)
5542{
5543 PyUnicodeObject *u;
5544 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005545 int nchars;
5546 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547
5548 if (len < 0)
5549 len = 0;
5550
Tim Peters7a29bd52001-09-12 03:03:31 +00005551 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 /* no repeat, return original string */
5553 Py_INCREF(str);
5554 return (PyObject*) str;
5555 }
Tim Peters8f422462000-09-09 06:13:41 +00005556
5557 /* ensure # of chars needed doesn't overflow int and # of bytes
5558 * needed doesn't overflow size_t
5559 */
5560 nchars = len * str->length;
5561 if (len && nchars / len != str->length) {
5562 PyErr_SetString(PyExc_OverflowError,
5563 "repeated string is too long");
5564 return NULL;
5565 }
5566 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5567 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5568 PyErr_SetString(PyExc_OverflowError,
5569 "repeated string is too long");
5570 return NULL;
5571 }
5572 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 if (!u)
5574 return NULL;
5575
5576 p = u->str;
5577
5578 while (len-- > 0) {
5579 Py_UNICODE_COPY(p, str->str, str->length);
5580 p += str->length;
5581 }
5582
5583 return (PyObject*) u;
5584}
5585
5586PyObject *PyUnicode_Replace(PyObject *obj,
5587 PyObject *subobj,
5588 PyObject *replobj,
5589 int maxcount)
5590{
5591 PyObject *self;
5592 PyObject *str1;
5593 PyObject *str2;
5594 PyObject *result;
5595
5596 self = PyUnicode_FromObject(obj);
5597 if (self == NULL)
5598 return NULL;
5599 str1 = PyUnicode_FromObject(subobj);
5600 if (str1 == NULL) {
5601 Py_DECREF(self);
5602 return NULL;
5603 }
5604 str2 = PyUnicode_FromObject(replobj);
5605 if (str2 == NULL) {
5606 Py_DECREF(self);
5607 Py_DECREF(str1);
5608 return NULL;
5609 }
Tim Petersced69f82003-09-16 20:30:58 +00005610 result = replace((PyUnicodeObject *)self,
5611 (PyUnicodeObject *)str1,
5612 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 maxcount);
5614 Py_DECREF(self);
5615 Py_DECREF(str1);
5616 Py_DECREF(str2);
5617 return result;
5618}
5619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005620PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621"S.replace (old, new[, maxsplit]) -> unicode\n\
5622\n\
5623Return a copy of S with all occurrences of substring\n\
5624old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005625given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626
5627static PyObject*
5628unicode_replace(PyUnicodeObject *self, PyObject *args)
5629{
5630 PyUnicodeObject *str1;
5631 PyUnicodeObject *str2;
5632 int maxcount = -1;
5633 PyObject *result;
5634
5635 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5636 return NULL;
5637 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5638 if (str1 == NULL)
5639 return NULL;
5640 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005641 if (str2 == NULL) {
5642 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005644 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645
5646 result = replace(self, str1, str2, maxcount);
5647
5648 Py_DECREF(str1);
5649 Py_DECREF(str2);
5650 return result;
5651}
5652
5653static
5654PyObject *unicode_repr(PyObject *unicode)
5655{
5656 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5657 PyUnicode_GET_SIZE(unicode),
5658 1);
5659}
5660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005661PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662"S.rfind(sub [,start [,end]]) -> int\n\
5663\n\
5664Return the highest index in S where substring sub is found,\n\
5665such that sub is contained within s[start,end]. Optional\n\
5666arguments start and end are interpreted as in slice notation.\n\
5667\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005668Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669
5670static PyObject *
5671unicode_rfind(PyUnicodeObject *self, PyObject *args)
5672{
5673 PyUnicodeObject *substring;
5674 int start = 0;
5675 int end = INT_MAX;
5676 PyObject *result;
5677
Guido van Rossumb8872e62000-05-09 14:14:27 +00005678 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5679 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 return NULL;
5681 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5682 (PyObject *)substring);
5683 if (substring == NULL)
5684 return NULL;
5685
5686 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5687
5688 Py_DECREF(substring);
5689 return result;
5690}
5691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005692PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693"S.rindex(sub [,start [,end]]) -> int\n\
5694\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005695Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696
5697static PyObject *
5698unicode_rindex(PyUnicodeObject *self, PyObject *args)
5699{
5700 int result;
5701 PyUnicodeObject *substring;
5702 int start = 0;
5703 int end = INT_MAX;
5704
Guido van Rossumb8872e62000-05-09 14:14:27 +00005705 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5706 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 return NULL;
5708 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5709 (PyObject *)substring);
5710 if (substring == NULL)
5711 return NULL;
5712
5713 result = findstring(self, substring, start, end, -1);
5714
5715 Py_DECREF(substring);
5716 if (result < 0) {
5717 PyErr_SetString(PyExc_ValueError, "substring not found");
5718 return NULL;
5719 }
5720 return PyInt_FromLong(result);
5721}
5722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005723PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005724"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725\n\
5726Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005727done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728
5729static PyObject *
5730unicode_rjust(PyUnicodeObject *self, PyObject *args)
5731{
5732 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005733 Py_UNICODE fillchar = ' ';
5734
5735 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 return NULL;
5737
Tim Peters7a29bd52001-09-12 03:03:31 +00005738 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 Py_INCREF(self);
5740 return (PyObject*) self;
5741 }
5742
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005743 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744}
5745
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746static PyObject*
5747unicode_slice(PyUnicodeObject *self, int start, int end)
5748{
5749 /* standard clamping */
5750 if (start < 0)
5751 start = 0;
5752 if (end < 0)
5753 end = 0;
5754 if (end > self->length)
5755 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005756 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 /* full slice, return original string */
5758 Py_INCREF(self);
5759 return (PyObject*) self;
5760 }
5761 if (start > end)
5762 start = end;
5763 /* copy slice */
5764 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5765 end - start);
5766}
5767
5768PyObject *PyUnicode_Split(PyObject *s,
5769 PyObject *sep,
5770 int maxsplit)
5771{
5772 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005773
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774 s = PyUnicode_FromObject(s);
5775 if (s == NULL)
5776 return NULL;
5777 if (sep != NULL) {
5778 sep = PyUnicode_FromObject(sep);
5779 if (sep == NULL) {
5780 Py_DECREF(s);
5781 return NULL;
5782 }
5783 }
5784
5785 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5786
5787 Py_DECREF(s);
5788 Py_XDECREF(sep);
5789 return result;
5790}
5791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005792PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793"S.split([sep [,maxsplit]]) -> list of strings\n\
5794\n\
5795Return a list of the words in S, using sep as the\n\
5796delimiter string. If maxsplit is given, at most maxsplit\n\
5797splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005798is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799
5800static PyObject*
5801unicode_split(PyUnicodeObject *self, PyObject *args)
5802{
5803 PyObject *substring = Py_None;
5804 int maxcount = -1;
5805
5806 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5807 return NULL;
5808
5809 if (substring == Py_None)
5810 return split(self, NULL, maxcount);
5811 else if (PyUnicode_Check(substring))
5812 return split(self, (PyUnicodeObject *)substring, maxcount);
5813 else
5814 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5815}
5816
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005817PyObject *PyUnicode_RSplit(PyObject *s,
5818 PyObject *sep,
5819 int maxsplit)
5820{
5821 PyObject *result;
5822
5823 s = PyUnicode_FromObject(s);
5824 if (s == NULL)
5825 return NULL;
5826 if (sep != NULL) {
5827 sep = PyUnicode_FromObject(sep);
5828 if (sep == NULL) {
5829 Py_DECREF(s);
5830 return NULL;
5831 }
5832 }
5833
5834 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5835
5836 Py_DECREF(s);
5837 Py_XDECREF(sep);
5838 return result;
5839}
5840
5841PyDoc_STRVAR(rsplit__doc__,
5842"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
5843\n\
5844Return a list of the words in S, using sep as the\n\
5845delimiter string, starting at the end of the string and\n\
5846working to the front. If maxsplit is given, at most maxsplit\n\
5847splits are done. If sep is not specified, any whitespace string\n\
5848is a separator.");
5849
5850static PyObject*
5851unicode_rsplit(PyUnicodeObject *self, PyObject *args)
5852{
5853 PyObject *substring = Py_None;
5854 int maxcount = -1;
5855
5856 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
5857 return NULL;
5858
5859 if (substring == Py_None)
5860 return rsplit(self, NULL, maxcount);
5861 else if (PyUnicode_Check(substring))
5862 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
5863 else
5864 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
5865}
5866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005867PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005868"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869\n\
5870Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005871Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005872is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
5874static PyObject*
5875unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5876{
Guido van Rossum86662912000-04-11 15:38:46 +00005877 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878
Guido van Rossum86662912000-04-11 15:38:46 +00005879 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 return NULL;
5881
Guido van Rossum86662912000-04-11 15:38:46 +00005882 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883}
5884
5885static
5886PyObject *unicode_str(PyUnicodeObject *self)
5887{
Fred Drakee4315f52000-05-09 19:53:39 +00005888 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889}
5890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005891PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892"S.swapcase() -> unicode\n\
5893\n\
5894Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005895and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896
5897static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005898unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 return fixup(self, fixswapcase);
5901}
5902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005903PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904"S.translate(table) -> unicode\n\
5905\n\
5906Return a copy of the string S, where all characters have been mapped\n\
5907through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005908Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5909Unmapped characters are left untouched. Characters mapped to None\n\
5910are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911
5912static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005913unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914{
Tim Petersced69f82003-09-16 20:30:58 +00005915 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00005917 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 "ignore");
5919}
5920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005921PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922"S.upper() -> unicode\n\
5923\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005924Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925
5926static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005927unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 return fixup(self, fixupper);
5930}
5931
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005932PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933"S.zfill(width) -> unicode\n\
5934\n\
5935Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005936of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937
5938static PyObject *
5939unicode_zfill(PyUnicodeObject *self, PyObject *args)
5940{
5941 int fill;
5942 PyUnicodeObject *u;
5943
5944 int width;
5945 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5946 return NULL;
5947
5948 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005949 if (PyUnicode_CheckExact(self)) {
5950 Py_INCREF(self);
5951 return (PyObject*) self;
5952 }
5953 else
5954 return PyUnicode_FromUnicode(
5955 PyUnicode_AS_UNICODE(self),
5956 PyUnicode_GET_SIZE(self)
5957 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 }
5959
5960 fill = width - self->length;
5961
5962 u = pad(self, fill, 0, '0');
5963
Walter Dörwald068325e2002-04-15 13:36:47 +00005964 if (u == NULL)
5965 return NULL;
5966
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 if (u->str[fill] == '+' || u->str[fill] == '-') {
5968 /* move sign to beginning of string */
5969 u->str[0] = u->str[fill];
5970 u->str[fill] = '0';
5971 }
5972
5973 return (PyObject*) u;
5974}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975
#if 0
/* Compiled out: returns unicode_freelist_size as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(
        unicode_freelist_size);
}
#endif
5983
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005984PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005985"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005987Return True if S starts with the specified prefix, False otherwise.\n\
5988With optional start, test S beginning at that position.\n\
5989With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990
5991static PyObject *
5992unicode_startswith(PyUnicodeObject *self,
5993 PyObject *args)
5994{
5995 PyUnicodeObject *substring;
5996 int start = 0;
5997 int end = INT_MAX;
5998 PyObject *result;
5999
Guido van Rossumb8872e62000-05-09 14:14:27 +00006000 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6001 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 return NULL;
6003 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6004 (PyObject *)substring);
6005 if (substring == NULL)
6006 return NULL;
6007
Guido van Rossum77f6a652002-04-03 22:41:51 +00006008 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009
6010 Py_DECREF(substring);
6011 return result;
6012}
6013
6014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006015PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006016"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006018Return True if S ends with the specified suffix, False otherwise.\n\
6019With optional start, test S beginning at that position.\n\
6020With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
6022static PyObject *
6023unicode_endswith(PyUnicodeObject *self,
6024 PyObject *args)
6025{
6026 PyUnicodeObject *substring;
6027 int start = 0;
6028 int end = INT_MAX;
6029 PyObject *result;
6030
Guido van Rossumb8872e62000-05-09 14:14:27 +00006031 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6032 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 return NULL;
6034 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6035 (PyObject *)substring);
6036 if (substring == NULL)
6037 return NULL;
6038
Guido van Rossum77f6a652002-04-03 22:41:51 +00006039 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040
6041 Py_DECREF(substring);
6042 return result;
6043}
6044
6045
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006046
6047static PyObject *
6048unicode_getnewargs(PyUnicodeObject *v)
6049{
6050 return Py_BuildValue("(u#)", v->str, v->length);
6051}
6052
6053
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054static PyMethodDef unicode_methods[] = {
6055
6056 /* Order is according to common usage: often used methods should
6057 appear first, since lookup is done sequentially. */
6058
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006059 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6060 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6061 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006062 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006063 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6064 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6065 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6066 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6067 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6068 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6069 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6070 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6071 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6072 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006073 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006074/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6075 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6076 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6077 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006078 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006079 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006080 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006081 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6082 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6083 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6084 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6085 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6086 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6087 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6088 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6089 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6090 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6091 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6092 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6093 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6094 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006095 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006096#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006097 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098#endif
6099
6100#if 0
6101 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006102 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103#endif
6104
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006105 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 {NULL, NULL}
6107};
6108
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006109static PyObject *
6110unicode_mod(PyObject *v, PyObject *w)
6111{
6112 if (!PyUnicode_Check(v)) {
6113 Py_INCREF(Py_NotImplemented);
6114 return Py_NotImplemented;
6115 }
6116 return PyUnicode_Format(v, w);
6117}
6118
6119static PyNumberMethods unicode_as_number = {
6120 0, /*nb_add*/
6121 0, /*nb_subtract*/
6122 0, /*nb_multiply*/
6123 0, /*nb_divide*/
6124 unicode_mod, /*nb_remainder*/
6125};
6126
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127static PySequenceMethods unicode_as_sequence = {
6128 (inquiry) unicode_length, /* sq_length */
6129 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6130 (intargfunc) unicode_repeat, /* sq_repeat */
6131 (intargfunc) unicode_getitem, /* sq_item */
6132 (intintargfunc) unicode_slice, /* sq_slice */
6133 0, /* sq_ass_item */
6134 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006135 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136};
6137
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006138static PyObject*
6139unicode_subscript(PyUnicodeObject* self, PyObject* item)
6140{
6141 if (PyInt_Check(item)) {
6142 long i = PyInt_AS_LONG(item);
6143 if (i < 0)
6144 i += PyString_GET_SIZE(self);
6145 return unicode_getitem(self, i);
6146 } else if (PyLong_Check(item)) {
6147 long i = PyLong_AsLong(item);
6148 if (i == -1 && PyErr_Occurred())
6149 return NULL;
6150 if (i < 0)
6151 i += PyString_GET_SIZE(self);
6152 return unicode_getitem(self, i);
6153 } else if (PySlice_Check(item)) {
6154 int start, stop, step, slicelength, cur, i;
6155 Py_UNICODE* source_buf;
6156 Py_UNICODE* result_buf;
6157 PyObject* result;
6158
6159 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6160 &start, &stop, &step, &slicelength) < 0) {
6161 return NULL;
6162 }
6163
6164 if (slicelength <= 0) {
6165 return PyUnicode_FromUnicode(NULL, 0);
6166 } else {
6167 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6168 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6169
6170 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6171 result_buf[i] = source_buf[cur];
6172 }
Tim Petersced69f82003-09-16 20:30:58 +00006173
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006174 result = PyUnicode_FromUnicode(result_buf, slicelength);
6175 PyMem_FREE(result_buf);
6176 return result;
6177 }
6178 } else {
6179 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6180 return NULL;
6181 }
6182}
6183
6184static PyMappingMethods unicode_as_mapping = {
6185 (inquiry)unicode_length, /* mp_length */
6186 (binaryfunc)unicode_subscript, /* mp_subscript */
6187 (objobjargproc)0, /* mp_ass_subscript */
6188};
6189
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190static int
6191unicode_buffer_getreadbuf(PyUnicodeObject *self,
6192 int index,
6193 const void **ptr)
6194{
6195 if (index != 0) {
6196 PyErr_SetString(PyExc_SystemError,
6197 "accessing non-existent unicode segment");
6198 return -1;
6199 }
6200 *ptr = (void *) self->str;
6201 return PyUnicode_GET_DATA_SIZE(self);
6202}
6203
6204static int
6205unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6206 const void **ptr)
6207{
6208 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006209 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 return -1;
6211}
6212
6213static int
6214unicode_buffer_getsegcount(PyUnicodeObject *self,
6215 int *lenp)
6216{
6217 if (lenp)
6218 *lenp = PyUnicode_GET_DATA_SIZE(self);
6219 return 1;
6220}
6221
6222static int
6223unicode_buffer_getcharbuf(PyUnicodeObject *self,
6224 int index,
6225 const void **ptr)
6226{
6227 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006228
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 if (index != 0) {
6230 PyErr_SetString(PyExc_SystemError,
6231 "accessing non-existent unicode segment");
6232 return -1;
6233 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006234 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 if (str == NULL)
6236 return -1;
6237 *ptr = (void *) PyString_AS_STRING(str);
6238 return PyString_GET_SIZE(str);
6239}
6240
6241/* Helpers for PyUnicode_Format() */
6242
6243static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006244getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245{
6246 int argidx = *p_argidx;
6247 if (argidx < arglen) {
6248 (*p_argidx)++;
6249 if (arglen < 0)
6250 return args;
6251 else
6252 return PyTuple_GetItem(args, argidx);
6253 }
6254 PyErr_SetString(PyExc_TypeError,
6255 "not enough arguments for format string");
6256 return NULL;
6257}
6258
/* Bit flags collected while parsing a format specifier; each one is set
   by the flag character noted next to it. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
6264
6265static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267{
6268 register int i;
6269 int len;
6270 va_list va;
6271 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273
6274 /* First, format the string as char array, then expand to Py_UNICODE
6275 array. */
6276 charbuffer = (char *)buffer;
6277 len = vsprintf(charbuffer, format, va);
6278 for (i = len - 1; i >= 0; i--)
6279 buffer[i] = (Py_UNICODE) charbuffer[i];
6280
6281 va_end(va);
6282 return len;
6283}
6284
Guido van Rossum078151d2002-08-11 04:24:12 +00006285/* XXX To save some code duplication, formatfloat/long/int could have been
6286 shared with stringobject.c, converting from 8-bit to Unicode after the
6287 formatting is done. */
6288
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289static int
6290formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006291 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 int flags,
6293 int prec,
6294 int type,
6295 PyObject *v)
6296{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006297 /* fmt = '%#.' + `prec` + `type`
6298 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 char fmt[20];
6300 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006301
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302 x = PyFloat_AsDouble(v);
6303 if (x == -1.0 && PyErr_Occurred())
6304 return -1;
6305 if (prec < 0)
6306 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6308 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006309 /* Worst case length calc to ensure no buffer overrun:
6310
6311 'g' formats:
6312 fmt = %#.<prec>g
6313 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6314 for any double rep.)
6315 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6316
6317 'f' formats:
6318 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6319 len = 1 + 50 + 1 + prec = 52 + prec
6320
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006321 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006322 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006323
6324 */
6325 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6326 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006327 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006328 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006329 return -1;
6330 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006331 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6332 (flags&F_ALT) ? "#" : "",
6333 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334 return usprintf(buf, fmt, x);
6335}
6336
Tim Peters38fd5b62000-09-21 05:43:11 +00006337static PyObject*
6338formatlong(PyObject *val, int flags, int prec, int type)
6339{
6340 char *buf;
6341 int i, len;
6342 PyObject *str; /* temporary string object. */
6343 PyUnicodeObject *result;
6344
6345 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6346 if (!str)
6347 return NULL;
6348 result = _PyUnicode_New(len);
6349 for (i = 0; i < len; i++)
6350 result->str[i] = buf[i];
6351 result->str[len] = 0;
6352 Py_DECREF(str);
6353 return (PyObject*)result;
6354}
6355
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356static int
6357formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006358 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 int flags,
6360 int prec,
6361 int type,
6362 PyObject *v)
6363{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006364 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006365 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6366 * + 1 + 1
6367 * = 24
6368 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006369 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006370 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 long x;
6372
6373 x = PyInt_AsLong(v);
6374 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006375 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006376 if (x < 0 && type == 'u') {
6377 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006378 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006379 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6380 sign = "-";
6381 else
6382 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006384 prec = 1;
6385
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006386 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6387 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006388 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006389 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006390 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006391 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006392 return -1;
6393 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006394
6395 if ((flags & F_ALT) &&
6396 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006397 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006398 * of issues that cause pain:
6399 * - when 0 is being converted, the C standard leaves off
6400 * the '0x' or '0X', which is inconsistent with other
6401 * %#x/%#X conversions and inconsistent with Python's
6402 * hex() function
6403 * - there are platforms that violate the standard and
6404 * convert 0 with the '0x' or '0X'
6405 * (Metrowerks, Compaq Tru64)
6406 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006407 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006408 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006409 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006410 * We can achieve the desired consistency by inserting our
6411 * own '0x' or '0X' prefix, and substituting %x/%X in place
6412 * of %#x/%#X.
6413 *
6414 * Note that this is the same approach as used in
6415 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006416 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006417 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6418 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006419 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006420 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006421 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6422 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006423 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006424 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006425 if (sign[0])
6426 return usprintf(buf, fmt, -x);
6427 else
6428 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429}
6430
6431static int
6432formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006433 size_t buflen,
6434 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006436 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006437 if (PyUnicode_Check(v)) {
6438 if (PyUnicode_GET_SIZE(v) != 1)
6439 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006443 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006444 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006445 goto onError;
6446 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6447 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448
6449 else {
6450 /* Integer input truncated to a character */
6451 long x;
6452 x = PyInt_AsLong(v);
6453 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006454 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006455#ifdef Py_UNICODE_WIDE
6456 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006457 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006458 "%c arg not in range(0x110000) "
6459 "(wide Python build)");
6460 return -1;
6461 }
6462#else
6463 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006464 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006465 "%c arg not in range(0x10000) "
6466 "(narrow Python build)");
6467 return -1;
6468 }
6469#endif
6470 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 }
6472 buf[1] = '\0';
6473 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006474
6475 onError:
6476 PyErr_SetString(PyExc_TypeError,
6477 "%c requires int or char");
6478 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479}
6480
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
6490
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491PyObject *PyUnicode_Format(PyObject *format,
6492 PyObject *args)
6493{
6494 Py_UNICODE *fmt, *res;
6495 int fmtcnt, rescnt, reslen, arglen, argidx;
6496 int args_owned = 0;
6497 PyUnicodeObject *result = NULL;
6498 PyObject *dict = NULL;
6499 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006500
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 if (format == NULL || args == NULL) {
6502 PyErr_BadInternalCall();
6503 return NULL;
6504 }
6505 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006506 if (uformat == NULL)
6507 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 fmt = PyUnicode_AS_UNICODE(uformat);
6509 fmtcnt = PyUnicode_GET_SIZE(uformat);
6510
6511 reslen = rescnt = fmtcnt + 100;
6512 result = _PyUnicode_New(reslen);
6513 if (result == NULL)
6514 goto onError;
6515 res = PyUnicode_AS_UNICODE(result);
6516
6517 if (PyTuple_Check(args)) {
6518 arglen = PyTuple_Size(args);
6519 argidx = 0;
6520 }
6521 else {
6522 arglen = -1;
6523 argidx = -2;
6524 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006525 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6526 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 dict = args;
6528
6529 while (--fmtcnt >= 0) {
6530 if (*fmt != '%') {
6531 if (--rescnt < 0) {
6532 rescnt = fmtcnt + 100;
6533 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006534 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535 return NULL;
6536 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6537 --rescnt;
6538 }
6539 *res++ = *fmt++;
6540 }
6541 else {
6542 /* Got a format specifier */
6543 int flags = 0;
6544 int width = -1;
6545 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 Py_UNICODE c = '\0';
6547 Py_UNICODE fill;
6548 PyObject *v = NULL;
6549 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006550 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 Py_UNICODE sign;
6552 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006553 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554
6555 fmt++;
6556 if (*fmt == '(') {
6557 Py_UNICODE *keystart;
6558 int keylen;
6559 PyObject *key;
6560 int pcount = 1;
6561
6562 if (dict == NULL) {
6563 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006564 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 goto onError;
6566 }
6567 ++fmt;
6568 --fmtcnt;
6569 keystart = fmt;
6570 /* Skip over balanced parentheses */
6571 while (pcount > 0 && --fmtcnt >= 0) {
6572 if (*fmt == ')')
6573 --pcount;
6574 else if (*fmt == '(')
6575 ++pcount;
6576 fmt++;
6577 }
6578 keylen = fmt - keystart - 1;
6579 if (fmtcnt < 0 || pcount > 0) {
6580 PyErr_SetString(PyExc_ValueError,
6581 "incomplete format key");
6582 goto onError;
6583 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006584#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006585 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 then looked up since Python uses strings to hold
6587 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006588 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 key = PyUnicode_EncodeUTF8(keystart,
6590 keylen,
6591 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006592#else
6593 key = PyUnicode_FromUnicode(keystart, keylen);
6594#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 if (key == NULL)
6596 goto onError;
6597 if (args_owned) {
6598 Py_DECREF(args);
6599 args_owned = 0;
6600 }
6601 args = PyObject_GetItem(dict, key);
6602 Py_DECREF(key);
6603 if (args == NULL) {
6604 goto onError;
6605 }
6606 args_owned = 1;
6607 arglen = -1;
6608 argidx = -2;
6609 }
6610 while (--fmtcnt >= 0) {
6611 switch (c = *fmt++) {
6612 case '-': flags |= F_LJUST; continue;
6613 case '+': flags |= F_SIGN; continue;
6614 case ' ': flags |= F_BLANK; continue;
6615 case '#': flags |= F_ALT; continue;
6616 case '0': flags |= F_ZERO; continue;
6617 }
6618 break;
6619 }
6620 if (c == '*') {
6621 v = getnextarg(args, arglen, &argidx);
6622 if (v == NULL)
6623 goto onError;
6624 if (!PyInt_Check(v)) {
6625 PyErr_SetString(PyExc_TypeError,
6626 "* wants int");
6627 goto onError;
6628 }
6629 width = PyInt_AsLong(v);
6630 if (width < 0) {
6631 flags |= F_LJUST;
6632 width = -width;
6633 }
6634 if (--fmtcnt >= 0)
6635 c = *fmt++;
6636 }
6637 else if (c >= '0' && c <= '9') {
6638 width = c - '0';
6639 while (--fmtcnt >= 0) {
6640 c = *fmt++;
6641 if (c < '0' || c > '9')
6642 break;
6643 if ((width*10) / 10 != width) {
6644 PyErr_SetString(PyExc_ValueError,
6645 "width too big");
6646 goto onError;
6647 }
6648 width = width*10 + (c - '0');
6649 }
6650 }
6651 if (c == '.') {
6652 prec = 0;
6653 if (--fmtcnt >= 0)
6654 c = *fmt++;
6655 if (c == '*') {
6656 v = getnextarg(args, arglen, &argidx);
6657 if (v == NULL)
6658 goto onError;
6659 if (!PyInt_Check(v)) {
6660 PyErr_SetString(PyExc_TypeError,
6661 "* wants int");
6662 goto onError;
6663 }
6664 prec = PyInt_AsLong(v);
6665 if (prec < 0)
6666 prec = 0;
6667 if (--fmtcnt >= 0)
6668 c = *fmt++;
6669 }
6670 else if (c >= '0' && c <= '9') {
6671 prec = c - '0';
6672 while (--fmtcnt >= 0) {
6673 c = Py_CHARMASK(*fmt++);
6674 if (c < '0' || c > '9')
6675 break;
6676 if ((prec*10) / 10 != prec) {
6677 PyErr_SetString(PyExc_ValueError,
6678 "prec too big");
6679 goto onError;
6680 }
6681 prec = prec*10 + (c - '0');
6682 }
6683 }
6684 } /* prec */
6685 if (fmtcnt >= 0) {
6686 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 if (--fmtcnt >= 0)
6688 c = *fmt++;
6689 }
6690 }
6691 if (fmtcnt < 0) {
6692 PyErr_SetString(PyExc_ValueError,
6693 "incomplete format");
6694 goto onError;
6695 }
6696 if (c != '%') {
6697 v = getnextarg(args, arglen, &argidx);
6698 if (v == NULL)
6699 goto onError;
6700 }
6701 sign = 0;
6702 fill = ' ';
6703 switch (c) {
6704
6705 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006706 pbuf = formatbuf;
6707 /* presume that buffer length is at least 1 */
6708 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 len = 1;
6710 break;
6711
6712 case 's':
6713 case 'r':
6714 if (PyUnicode_Check(v) && c == 's') {
6715 temp = v;
6716 Py_INCREF(temp);
6717 }
6718 else {
6719 PyObject *unicode;
6720 if (c == 's')
6721 temp = PyObject_Str(v);
6722 else
6723 temp = PyObject_Repr(v);
6724 if (temp == NULL)
6725 goto onError;
6726 if (!PyString_Check(temp)) {
6727 /* XXX Note: this should never happen, since
6728 PyObject_Repr() and PyObject_Str() assure
6729 this */
6730 Py_DECREF(temp);
6731 PyErr_SetString(PyExc_TypeError,
6732 "%s argument has non-string str()");
6733 goto onError;
6734 }
Fred Drakee4315f52000-05-09 19:53:39 +00006735 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006737 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 "strict");
6739 Py_DECREF(temp);
6740 temp = unicode;
6741 if (temp == NULL)
6742 goto onError;
6743 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006744 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 len = PyUnicode_GET_SIZE(temp);
6746 if (prec >= 0 && len > prec)
6747 len = prec;
6748 break;
6749
6750 case 'i':
6751 case 'd':
6752 case 'u':
6753 case 'o':
6754 case 'x':
6755 case 'X':
6756 if (c == 'i')
6757 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006758 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006759 temp = formatlong(v, flags, prec, c);
6760 if (!temp)
6761 goto onError;
6762 pbuf = PyUnicode_AS_UNICODE(temp);
6763 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00006764 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006766 else {
6767 pbuf = formatbuf;
6768 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6769 flags, prec, c, v);
6770 if (len < 0)
6771 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006772 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006773 }
6774 if (flags & F_ZERO)
6775 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 break;
6777
6778 case 'e':
6779 case 'E':
6780 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006781 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782 case 'g':
6783 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006784 if (c == 'F')
6785 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006786 pbuf = formatbuf;
6787 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6788 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 if (len < 0)
6790 goto onError;
6791 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006792 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793 fill = '0';
6794 break;
6795
6796 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006797 pbuf = formatbuf;
6798 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799 if (len < 0)
6800 goto onError;
6801 break;
6802
6803 default:
6804 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006805 "unsupported format character '%c' (0x%x) "
6806 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006807 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006808 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006809 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 goto onError;
6811 }
6812 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006813 if (*pbuf == '-' || *pbuf == '+') {
6814 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 len--;
6816 }
6817 else if (flags & F_SIGN)
6818 sign = '+';
6819 else if (flags & F_BLANK)
6820 sign = ' ';
6821 else
6822 sign = 0;
6823 }
6824 if (width < len)
6825 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006826 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 reslen -= rescnt;
6828 rescnt = width + fmtcnt + 100;
6829 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006830 if (reslen < 0) {
6831 Py_DECREF(result);
6832 return PyErr_NoMemory();
6833 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006834 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 return NULL;
6836 res = PyUnicode_AS_UNICODE(result)
6837 + reslen - rescnt;
6838 }
6839 if (sign) {
6840 if (fill != ' ')
6841 *res++ = sign;
6842 rescnt--;
6843 if (width > len)
6844 width--;
6845 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006846 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6847 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006848 assert(pbuf[1] == c);
6849 if (fill != ' ') {
6850 *res++ = *pbuf++;
6851 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006852 }
Tim Petersfff53252001-04-12 18:38:48 +00006853 rescnt -= 2;
6854 width -= 2;
6855 if (width < 0)
6856 width = 0;
6857 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 if (width > len && !(flags & F_LJUST)) {
6860 do {
6861 --rescnt;
6862 *res++ = fill;
6863 } while (--width > len);
6864 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006865 if (fill == ' ') {
6866 if (sign)
6867 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006868 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006869 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006870 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006871 *res++ = *pbuf++;
6872 *res++ = *pbuf++;
6873 }
6874 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006875 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 res += len;
6877 rescnt -= len;
6878 while (--width >= len) {
6879 --rescnt;
6880 *res++ = ' ';
6881 }
6882 if (dict && (argidx < arglen) && c != '%') {
6883 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006884 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 goto onError;
6886 }
6887 Py_XDECREF(temp);
6888 } /* '%' */
6889 } /* until end */
6890 if (argidx < arglen && !dict) {
6891 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006892 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 goto onError;
6894 }
6895
6896 if (args_owned) {
6897 Py_DECREF(args);
6898 }
6899 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00006900 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006901 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 return (PyObject *)result;
6903
6904 onError:
6905 Py_XDECREF(result);
6906 Py_DECREF(uformat);
6907 if (args_owned) {
6908 Py_DECREF(args);
6909 }
6910 return NULL;
6911}
6912
6913static PyBufferProcs unicode_as_buffer = {
6914 (getreadbufferproc) unicode_buffer_getreadbuf,
6915 (getwritebufferproc) unicode_buffer_getwritebuf,
6916 (getsegcountproc) unicode_buffer_getsegcount,
6917 (getcharbufferproc) unicode_buffer_getcharbuf,
6918};
6919
Jeremy Hylton938ace62002-07-17 16:30:39 +00006920static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006921unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6922
Tim Peters6d6c1a32001-08-02 04:15:00 +00006923static PyObject *
6924unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6925{
6926 PyObject *x = NULL;
6927 static char *kwlist[] = {"string", "encoding", "errors", 0};
6928 char *encoding = NULL;
6929 char *errors = NULL;
6930
Guido van Rossume023fe02001-08-30 03:12:59 +00006931 if (type != &PyUnicode_Type)
6932 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006933 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6934 kwlist, &x, &encoding, &errors))
6935 return NULL;
6936 if (x == NULL)
6937 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006938 if (encoding == NULL && errors == NULL)
6939 return PyObject_Unicode(x);
6940 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006941 return PyUnicode_FromEncodedObject(x, encoding, errors);
6942}
6943
Guido van Rossume023fe02001-08-30 03:12:59 +00006944static PyObject *
6945unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6946{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006947 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006948 int n;
6949
6950 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6951 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6952 if (tmp == NULL)
6953 return NULL;
6954 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006955 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006956 if (pnew == NULL) {
6957 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00006958 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00006959 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006960 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6961 if (pnew->str == NULL) {
6962 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006963 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006964 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00006965 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00006966 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006967 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6968 pnew->length = n;
6969 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006970 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006971 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006972}
6973
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006974PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00006975"unicode(string [, encoding[, errors]]) -> object\n\
6976\n\
6977Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00006978encoding defaults to the current default string encoding.\n\
6979errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006980
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981PyTypeObject PyUnicode_Type = {
6982 PyObject_HEAD_INIT(&PyType_Type)
6983 0, /* ob_size */
6984 "unicode", /* tp_name */
6985 sizeof(PyUnicodeObject), /* tp_size */
6986 0, /* tp_itemsize */
6987 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00006988 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006990 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991 0, /* tp_setattr */
6992 (cmpfunc) unicode_compare, /* tp_compare */
6993 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006994 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006996 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997 (hashfunc) unicode_hash, /* tp_hash*/
6998 0, /* tp_call*/
6999 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007000 PyObject_GenericGetAttr, /* tp_getattro */
7001 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007003 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7004 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007005 unicode_doc, /* tp_doc */
7006 0, /* tp_traverse */
7007 0, /* tp_clear */
7008 0, /* tp_richcompare */
7009 0, /* tp_weaklistoffset */
7010 0, /* tp_iter */
7011 0, /* tp_iternext */
7012 unicode_methods, /* tp_methods */
7013 0, /* tp_members */
7014 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007015 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007016 0, /* tp_dict */
7017 0, /* tp_descr_get */
7018 0, /* tp_descr_set */
7019 0, /* tp_dictoffset */
7020 0, /* tp_init */
7021 0, /* tp_alloc */
7022 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007023 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024};
7025
7026/* Initialize the Unicode implementation */
7027
Thomas Wouters78890102000-07-22 19:25:51 +00007028void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007030 int i;
7031
Fred Drakee4315f52000-05-09 19:53:39 +00007032 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007033 unicode_freelist = NULL;
7034 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007036 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007037 for (i = 0; i < 256; i++)
7038 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007039 if (PyType_Ready(&PyUnicode_Type) < 0)
7040 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041}
7042
7043/* Finalize the Unicode implementation */
7044
7045void
Thomas Wouters78890102000-07-22 19:25:51 +00007046_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007048 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007049 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007051 Py_XDECREF(unicode_empty);
7052 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007053
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007054 for (i = 0; i < 256; i++) {
7055 if (unicode_latin1[i]) {
7056 Py_DECREF(unicode_latin1[i]);
7057 unicode_latin1[i] = NULL;
7058 }
7059 }
7060
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007061 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 PyUnicodeObject *v = u;
7063 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007064 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007065 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007066 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007067 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007069 unicode_freelist = NULL;
7070 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007072
7073/*
7074Local variables:
7075c-basic-offset: 4
7076indent-tabs-mode: nil
7077End:
7078*/