/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
#include "Python.h"

#include "unicodeobject.h"
#include "ucnhash.h"

#ifdef MS_WINDOWS
#include <windows.h>
#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects parked on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   A Unicode object that lands on the free list keeps its character
   buffer allocated as long as its length is below this limit; longer
   buffers are released.  Recycling small objects therefore avoids a
   malloc() round trip.

   Worst case, this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switches; little endian is assumed unless the build
   defines WORDS_BIGENDIAN */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
/* Internal API for use in unicodeobject.c only !
   Forwards to PyUnicode_Resize() after casting the address of a
   Unicode object variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
#ifdef HAVE_WCHAR_H

/* Create a Unicode object from the first `size` elements of the
   wchar_t buffer `w`.

   Exactly `size` elements are read from `w`; the terminating 0 of the
   new object is the one written by _PyUnicode_New().

   Returns a new reference, or NULL with an exception set if `w` is
   NULL or the allocation fails. */
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
                                 int size)
{
    PyUnicodeObject *unicode;

    if (w == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;

    /* Copy the wchar_t data into the new object */
#ifdef HAVE_USABLE_WCHAR_T
    memcpy(unicode->str, w, size * sizeof(wchar_t));
#else
    {
        register Py_UNICODE *u;
        register int i;
        u = PyUnicode_AS_UNICODE(unicode);
        /* Copy size elements, not size + 1: w[size] is outside the
           range the caller handed us. */
        for (i = size; i > 0; i--)
            *u++ = *w++;
    }
#endif

    return (PyObject *)unicode;
}

/* Copy at most `size` characters of `unicode` into the wchar_t buffer
   `w`.

   The count is clipped to the length of the Unicode object.  No
   terminating 0 is appended to `w` by this function.

   Returns the number of characters copied, or -1 with an exception
   set if `unicode` is NULL. */
int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
                         register wchar_t *w,
                         int size)
{
    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }
    if (size > PyUnicode_GET_SIZE(unicode))
        size = PyUnicode_GET_SIZE(unicode);
#ifdef HAVE_USABLE_WCHAR_T
    memcpy(w, unicode->str, size * sizeof(wchar_t));
#else
    {
        register Py_UNICODE *u;
        register int i;
        u = PyUnicode_AS_UNICODE(unicode);
        /* Copy size elements, not size + 1: writing w[size] would
           overrun a buffer of exactly `size` elements. */
        for (i = size; i > 0; i--)
            *w++ = *u++;
    }
#endif

    return size;
}

#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000408 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
Hye-Shik Chang40574832004-04-06 07:24:51 +0000426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000428}
429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromObject(register PyObject *obj)
431{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000432 /* XXX Perhaps we should make this API an alias of
433 PyObject_Unicode() instead ?! */
434 if (PyUnicode_CheckExact(obj)) {
435 Py_INCREF(obj);
436 return obj;
437 }
438 if (PyUnicode_Check(obj)) {
439 /* For a Unicode subtype that's not a Unicode object,
440 return a true Unicode object with the same data. */
441 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
442 PyUnicode_GET_SIZE(obj));
443 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000444 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
445}
446
447PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
448 const char *encoding,
449 const char *errors)
450{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000451 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000453 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 if (obj == NULL) {
456 PyErr_BadInternalCall();
457 return NULL;
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460#if 0
461 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000462 that no encodings is given and then redirect to
463 PyObject_Unicode() which then applies the additional logic for
464 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466 NOTE: This API should really only be used for object which
467 represent *encoded* Unicode !
468
469 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000470 if (PyUnicode_Check(obj)) {
471 if (encoding) {
472 PyErr_SetString(PyExc_TypeError,
473 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000474 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000475 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000476 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000477 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000478#else
479 if (PyUnicode_Check(obj)) {
480 PyErr_SetString(PyExc_TypeError,
481 "decoding Unicode is not supported");
482 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#endif
485
486 /* Coerce object */
487 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000488 s = PyString_AS_STRING(obj);
489 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
492 /* Overwrite the error message with something more useful in
493 case of a TypeError. */
494 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000495 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000496 "coercing to Unicode: need string or buffer, "
497 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 obj->ob_type->tp_name);
499 goto onError;
500 }
Tim Petersced69f82003-09-16 20:30:58 +0000501
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000502 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 if (len == 0) {
504 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000509
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 return v;
511
512 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514}
515
516PyObject *PyUnicode_Decode(const char *s,
517 int size,
518 const char *encoding,
519 const char *errors)
520{
521 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000522
523 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000524 encoding = PyUnicode_GetDefaultEncoding();
525
526 /* Shortcuts for common default encodings */
527 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000529 else if (strcmp(encoding, "latin-1") == 0)
530 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
532 else if (strcmp(encoding, "mbcs") == 0)
533 return PyUnicode_DecodeMBCS(s, size, errors);
534#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "ascii") == 0)
536 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537
538 /* Decode via the codec registry */
539 buffer = PyBuffer_FromMemory((void *)s, size);
540 if (buffer == NULL)
541 goto onError;
542 unicode = PyCodec_Decode(buffer, encoding, errors);
543 if (unicode == NULL)
544 goto onError;
545 if (!PyUnicode_Check(unicode)) {
546 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000547 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548 unicode->ob_type->tp_name);
549 Py_DECREF(unicode);
550 goto onError;
551 }
552 Py_DECREF(buffer);
553 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000554
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555 onError:
556 Py_XDECREF(buffer);
557 return NULL;
558}
559
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000560PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
561 const char *encoding,
562 const char *errors)
563{
564 PyObject *v;
565
566 if (!PyUnicode_Check(unicode)) {
567 PyErr_BadArgument();
568 goto onError;
569 }
570
571 if (encoding == NULL)
572 encoding = PyUnicode_GetDefaultEncoding();
573
574 /* Decode via the codec registry */
575 v = PyCodec_Decode(unicode, encoding, errors);
576 if (v == NULL)
577 goto onError;
578 return v;
579
580 onError:
581 return NULL;
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584PyObject *PyUnicode_Encode(const Py_UNICODE *s,
585 int size,
586 const char *encoding,
587 const char *errors)
588{
589 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000590
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 unicode = PyUnicode_FromUnicode(s, size);
592 if (unicode == NULL)
593 return NULL;
594 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
595 Py_DECREF(unicode);
596 return v;
597}
598
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000599PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
600 const char *encoding,
601 const char *errors)
602{
603 PyObject *v;
604
605 if (!PyUnicode_Check(unicode)) {
606 PyErr_BadArgument();
607 goto onError;
608 }
609
610 if (encoding == NULL)
611 encoding = PyUnicode_GetDefaultEncoding();
612
613 /* Encode via the codec registry */
614 v = PyCodec_Encode(unicode, encoding, errors);
615 if (v == NULL)
616 goto onError;
617 return v;
618
619 onError:
620 return NULL;
621}
622
Guido van Rossumd57fd912000-03-10 22:53:23 +0000623PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
624 const char *encoding,
625 const char *errors)
626{
627 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 if (!PyUnicode_Check(unicode)) {
630 PyErr_BadArgument();
631 goto onError;
632 }
Fred Drakee4315f52000-05-09 19:53:39 +0000633
Tim Petersced69f82003-09-16 20:30:58 +0000634 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000635 encoding = PyUnicode_GetDefaultEncoding();
636
637 /* Shortcuts for common default encodings */
638 if (errors == NULL) {
639 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000640 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000641 else if (strcmp(encoding, "latin-1") == 0)
642 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000643#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
644 else if (strcmp(encoding, "mbcs") == 0)
645 return PyUnicode_AsMBCSString(unicode);
646#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000647 else if (strcmp(encoding, "ascii") == 0)
648 return PyUnicode_AsASCIIString(unicode);
649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650
651 /* Encode via the codec registry */
652 v = PyCodec_Encode(unicode, encoding, errors);
653 if (v == NULL)
654 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 if (!PyString_Check(v)) {
656 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000657 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 v->ob_type->tp_name);
659 Py_DECREF(v);
660 goto onError;
661 }
662 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000663
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 onError:
665 return NULL;
666}
667
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000668PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
669 const char *errors)
670{
671 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
672
673 if (v)
674 return v;
675 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
676 if (v && errors == NULL)
677 ((PyUnicodeObject *)unicode)->defenc = v;
678 return v;
679}
680
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
682{
683 if (!PyUnicode_Check(unicode)) {
684 PyErr_BadArgument();
685 goto onError;
686 }
687 return PyUnicode_AS_UNICODE(unicode);
688
689 onError:
690 return NULL;
691}
692
693int PyUnicode_GetSize(PyObject *unicode)
694{
695 if (!PyUnicode_Check(unicode)) {
696 PyErr_BadArgument();
697 goto onError;
698 }
699 return PyUnicode_GET_SIZE(unicode);
700
701 onError:
702 return -1;
703}
704
Thomas Wouters78890102000-07-22 19:25:51 +0000705const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000706{
707 return unicode_default_encoding;
708}
709
710int PyUnicode_SetDefaultEncoding(const char *encoding)
711{
712 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000713
Fred Drakee4315f52000-05-09 19:53:39 +0000714 /* Make sure the encoding is valid. As side effect, this also
715 loads the encoding into the codec registry cache. */
716 v = _PyCodec_Lookup(encoding);
717 if (v == NULL)
718 goto onError;
719 Py_DECREF(v);
720 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000721 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000722 sizeof(unicode_default_encoding));
723 return 0;
724
725 onError:
726 return -1;
727}
728
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000729/* error handling callback helper:
730 build arguments, call the callback and check the arguments,
731 if no exception occured, copy the replacement to the output
732 and adjust various state variables.
733 return 0 on success, -1 on error
734*/
735
736static
737int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
738 const char *encoding, const char *reason,
739 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
740 PyObject **output, int *outpos, Py_UNICODE **outptr)
741{
742 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
743
744 PyObject *restuple = NULL;
745 PyObject *repunicode = NULL;
746 int outsize = PyUnicode_GET_SIZE(*output);
747 int requiredsize;
748 int newpos;
749 Py_UNICODE *repptr;
750 int repsize;
751 int res = -1;
752
753 if (*errorHandler == NULL) {
754 *errorHandler = PyCodec_LookupError(errors);
755 if (*errorHandler == NULL)
756 goto onError;
757 }
758
759 if (*exceptionObject == NULL) {
760 *exceptionObject = PyUnicodeDecodeError_Create(
761 encoding, input, insize, *startinpos, *endinpos, reason);
762 if (*exceptionObject == NULL)
763 goto onError;
764 }
765 else {
766 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
767 goto onError;
768 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
769 goto onError;
770 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
771 goto onError;
772 }
773
774 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
775 if (restuple == NULL)
776 goto onError;
777 if (!PyTuple_Check(restuple)) {
778 PyErr_Format(PyExc_TypeError, &argparse[4]);
779 goto onError;
780 }
781 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
782 goto onError;
783 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000784 newpos = insize+newpos;
785 if (newpos<0 || newpos>insize) {
786 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
787 goto onError;
788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000789
790 /* need more space? (at least enough for what we
791 have+the replacement+the rest of the string (starting
792 at the new input position), so we won't have to check space
793 when there are no errors in the rest of the string) */
794 repptr = PyUnicode_AS_UNICODE(repunicode);
795 repsize = PyUnicode_GET_SIZE(repunicode);
796 requiredsize = *outpos + repsize + insize-newpos;
797 if (requiredsize > outsize) {
798 if (requiredsize<2*outsize)
799 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000800 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000801 goto onError;
802 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
803 }
804 *endinpos = newpos;
805 *inptr = input + newpos;
806 Py_UNICODE_COPY(*outptr, repptr, repsize);
807 *outptr += repsize;
808 *outpos += repsize;
809 /* we made it! */
810 res = 0;
811
812 onError:
813 Py_XDECREF(restuple);
814 return res;
815}
816
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000817/* --- UTF-7 Codec -------------------------------------------------------- */
818
819/* see RFC2152 for details */
820
/* Classification of the 128 ASCII code points for the UTF-7 codec,
   indexed by code point and consulted through the SPECIAL() macro:

     0 - not special: may always be written directly (RFC 2152 Set D)
     1 - special: must always be base64-encoded
     2 - whitespace: direct encoding is optional
     3 - RFC 2152 Set O: direct encoding is optional

   '/' (0x2F) is a member of Set D in RFC 2152 and is therefore class 0. */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F: controls; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F: controls */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F:  SP ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 0,
    /* 0x30 - 0x3F:  0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F:  @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F:  P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F:  ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F:  p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
839
/* Non-zero when code point `c` has to be base64-encoded in UTF-7:
   anything above 127, anything in class 1 of utf7_special, whitespace
   (class 2) when `encodeWS` is set, and Set O (class 3) when `encodeO`
   is set.  `c` is evaluated more than once, so pass a plain variable. */
#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127                                          \
     || utf7_special[(c)] == 1                          \
     || ((encodeWS) && utf7_special[(c)] == 2)          \
     || ((encodeO) && utf7_special[(c)] == 3))
844
/* B64(n): base64 alphabet character for the low six bits of `n`. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): non-zero when `c` is one of the 64 base64 alphabet
   characters.  Written as explicit ASCII range checks instead of
   isalnum() so that the result does not depend on the current locale and
   so that values outside the unsigned char range (callers pass Py_UNICODE
   code units) are simply rejected instead of invoking undefined
   behaviour.  `c` is evaluated more than once. */
#define B64CHAR(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ||                      \
     ((c) >= 'a' && (c) <= 'z') ||                      \
     ((c) >= '0' && (c) <= '9') ||                      \
     (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of base64 character `c`.  Only meaningful when
   B64CHAR(c) holds; relies on the ASCII layout ('a' - 71 == 26,
   'A' - 65 == 0, '0' + 4 == 52). */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
849
/* Emit as many base64 characters as `bits` allows (six bits each, most
   significant first) from the bit accumulator `ch` through the output
   cursor `out`.  `out` is advanced and `bits` is reduced in place; `ch`
   itself is left untouched. */
#define ENCODE(out, ch, bits)                           \
    do {                                                \
        while ((bits) >= 6) {                           \
            *(out)++ = B64((ch) >> ((bits) - 6));       \
            (bits) -= 6;                                \
        }                                               \
    } while (0)
855
/* Drain complete 16-bit units from the bit accumulator `ch` (which holds
   `bits` pending bits) and append them through the output cursor `out`.

   A unit in the range 0xDC00..0xDFFF is not stored: `surrogate` is set,
   `errmsg` is filled in and control jumps to the enclosing function's
   `utf7Error` label.  While `surrogate` is set, the next unit is dropped
   silently and the flag is cleared again.

   Expands to a bare `while` statement and relies on `errmsg` and the
   `utf7Error` label being in scope at the point of use. */
#define DECODE(out, ch, bits, surrogate)                                    \
    while (bits >= 16) {                                                    \
        Py_UNICODE unit = (Py_UNICODE) ((ch >> (bits - 16)) & 0xffff);      \
        bits -= 16;                                                         \
        if (surrogate) {                                                    \
            /* An error was already reported for the previous unit, so */   \
            /* do not bother validating this one. */                        \
            surrogate = 0;                                                  \
        }                                                                   \
        else if (unit >= 0xDC00 && unit <= 0xDFFF) {                        \
            /* Part of a surrogate pair; it cannot be represented in a */   \
            /* single 16-bit character. */                                  \
            surrogate = 1;                                                  \
            errmsg = "code pairs are not supported";                        \
            goto utf7Error;                                                 \
        }                                                                   \
        else {                                                              \
            *out++ = unit;                                                  \
        }                                                                   \
    }

Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000875PyObject *PyUnicode_DecodeUTF7(const char *s,
876 int size,
877 const char *errors)
878{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000879 const char *starts = s;
880 int startinpos;
881 int endinpos;
882 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000883 const char *e;
884 PyUnicodeObject *unicode;
885 Py_UNICODE *p;
886 const char *errmsg = "";
887 int inShift = 0;
888 unsigned int bitsleft = 0;
889 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000890 int surrogate = 0;
891 PyObject *errorHandler = NULL;
892 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000893
894 unicode = _PyUnicode_New(size);
895 if (!unicode)
896 return NULL;
897 if (size == 0)
898 return (PyObject *)unicode;
899
900 p = unicode->str;
901 e = s + size;
902
903 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 Py_UNICODE ch;
905 restart:
906 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000907
908 if (inShift) {
909 if ((ch == '-') || !B64CHAR(ch)) {
910 inShift = 0;
911 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000912
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000913 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
914 if (bitsleft >= 6) {
915 /* The shift sequence has a partial character in it. If
916 bitsleft < 6 then we could just classify it as padding
917 but that is not the case here */
918
919 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000920 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000921 }
922 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000923 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000924 here so indicate the potential of a misencoded character. */
925
926 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
927 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
928 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000929 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000930 }
931
932 if (ch == '-') {
933 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000934 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000935 inShift = 1;
936 }
937 } else if (SPECIAL(ch,0,0)) {
938 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000939 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000940 } else {
941 *p++ = ch;
942 }
943 } else {
944 charsleft = (charsleft << 6) | UB64(ch);
945 bitsleft += 6;
946 s++;
947 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
948 }
949 }
950 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000952 s++;
953 if (s < e && *s == '-') {
954 s++;
955 *p++ = '+';
956 } else
957 {
958 inShift = 1;
959 bitsleft = 0;
960 }
961 }
962 else if (SPECIAL(ch,0,0)) {
963 errmsg = "unexpected special character";
964 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000965 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000966 }
967 else {
968 *p++ = ch;
969 s++;
970 }
971 continue;
972 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000973 outpos = p-PyUnicode_AS_UNICODE(unicode);
974 endinpos = s-starts;
975 if (unicode_decode_call_errorhandler(
976 errors, &errorHandler,
977 "utf7", errmsg,
978 starts, size, &startinpos, &endinpos, &exc, &s,
979 (PyObject **)&unicode, &outpos, &p))
980 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000981 }
982
983 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000984 outpos = p-PyUnicode_AS_UNICODE(unicode);
985 endinpos = size;
986 if (unicode_decode_call_errorhandler(
987 errors, &errorHandler,
988 "utf7", "unterminated shift sequence",
989 starts, size, &startinpos, &endinpos, &exc, &s,
990 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000992 if (s < e)
993 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000994 }
995
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000996 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 goto onError;
998
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000999 Py_XDECREF(errorHandler);
1000 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001001 return (PyObject *)unicode;
1002
1003onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001004 Py_XDECREF(errorHandler);
1005 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006 Py_DECREF(unicode);
1007 return NULL;
1008}
1009
1010
1011PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1012 int size,
1013 int encodeSetO,
1014 int encodeWhiteSpace,
1015 const char *errors)
1016{
1017 PyObject *v;
1018 /* It might be possible to tighten this worst case */
1019 unsigned int cbAllocated = 5 * size;
1020 int inShift = 0;
1021 int i = 0;
1022 unsigned int bitsleft = 0;
1023 unsigned long charsleft = 0;
1024 char * out;
1025 char * start;
1026
1027 if (size == 0)
1028 return PyString_FromStringAndSize(NULL, 0);
1029
1030 v = PyString_FromStringAndSize(NULL, cbAllocated);
1031 if (v == NULL)
1032 return NULL;
1033
1034 start = out = PyString_AS_STRING(v);
1035 for (;i < size; ++i) {
1036 Py_UNICODE ch = s[i];
1037
1038 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001039 if (ch == '+') {
1040 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001041 *out++ = '-';
1042 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1043 charsleft = ch;
1044 bitsleft = 16;
1045 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001046 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001047 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001048 } else {
1049 *out++ = (char) ch;
1050 }
1051 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001052 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1053 *out++ = B64(charsleft << (6-bitsleft));
1054 charsleft = 0;
1055 bitsleft = 0;
1056 /* Characters not in the BASE64 set implicitly unshift the sequence
1057 so no '-' is required, except if the character is itself a '-' */
1058 if (B64CHAR(ch) || ch == '-') {
1059 *out++ = '-';
1060 }
1061 inShift = 0;
1062 *out++ = (char) ch;
1063 } else {
1064 bitsleft += 16;
1065 charsleft = (charsleft << 16) | ch;
1066 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1067
1068 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001069 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001070 or '-' then the shift sequence will be terminated implicitly and we
1071 don't have to insert a '-'. */
1072
1073 if (bitsleft == 0) {
1074 if (i + 1 < size) {
1075 Py_UNICODE ch2 = s[i+1];
1076
1077 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001078
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001079 } else if (B64CHAR(ch2) || ch2 == '-') {
1080 *out++ = '-';
1081 inShift = 0;
1082 } else {
1083 inShift = 0;
1084 }
1085
1086 }
1087 else {
1088 *out++ = '-';
1089 inShift = 0;
1090 }
1091 }
Tim Petersced69f82003-09-16 20:30:58 +00001092 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001094 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001095 if (bitsleft) {
1096 *out++= B64(charsleft << (6-bitsleft) );
1097 *out++ = '-';
1098 }
1099
Tim Peters5de98422002-04-27 18:44:32 +00001100 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001101 return v;
1102}
1103
/* The helper macros above are only used by the UTF-7 codec functions;
   remove them so the names are not visible in the rest of the file. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111/* --- UTF-8 Codec -------------------------------------------------------- */
1112
/* Total length in bytes of the UTF-8 sequence introduced by each possible
   lead byte (index = byte value).  Zero marks a byte that cannot start a
   sequence.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1134
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135PyObject *PyUnicode_DecodeUTF8(const char *s,
1136 int size,
1137 const char *errors)
1138{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001139 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001141 int startinpos;
1142 int endinpos;
1143 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 const char *e;
1145 PyUnicodeObject *unicode;
1146 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001147 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001148 PyObject *errorHandler = NULL;
1149 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150
1151 /* Note: size will always be longer than the resulting Unicode
1152 character count */
1153 unicode = _PyUnicode_New(size);
1154 if (!unicode)
1155 return NULL;
1156 if (size == 0)
1157 return (PyObject *)unicode;
1158
1159 /* Unpack UTF-8 encoded data */
1160 p = unicode->str;
1161 e = s + size;
1162
1163 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001164 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165
1166 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001167 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 s++;
1169 continue;
1170 }
1171
1172 n = utf8_code_length[ch];
1173
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001174 if (s + n > e) {
1175 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001176 startinpos = s-starts;
1177 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 goto utf8Error;
1179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180
1181 switch (n) {
1182
1183 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001184 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001185 startinpos = s-starts;
1186 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001187 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188
1189 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001190 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001191 startinpos = s-starts;
1192 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001193 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194
1195 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001196 if ((s[1] & 0xc0) != 0x80) {
1197 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001198 startinpos = s-starts;
1199 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001200 goto utf8Error;
1201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001203 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001204 startinpos = s-starts;
1205 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001206 errmsg = "illegal encoding";
1207 goto utf8Error;
1208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001210 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 break;
1212
1213 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001214 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001215 (s[2] & 0xc0) != 0x80) {
1216 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001217 startinpos = s-starts;
1218 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001219 goto utf8Error;
1220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001222 if (ch < 0x0800) {
1223 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001224 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001225
1226 XXX For wide builds (UCS-4) we should probably try
1227 to recombine the surrogates into a single code
1228 unit.
1229 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001230 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001231 startinpos = s-starts;
1232 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001233 goto utf8Error;
1234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001236 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001237 break;
1238
1239 case 4:
1240 if ((s[1] & 0xc0) != 0x80 ||
1241 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001242 (s[3] & 0xc0) != 0x80) {
1243 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001244 startinpos = s-starts;
1245 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001246 goto utf8Error;
1247 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001248 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1249 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1250 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001251 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001252 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001253 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001254 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001255 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001256 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001257 startinpos = s-starts;
1258 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001259 goto utf8Error;
1260 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001261#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001262 *p++ = (Py_UNICODE)ch;
1263#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001264 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001265
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001266 /* translate from 10000..10FFFF to 0..FFFF */
1267 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001268
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001269 /* high surrogate = top 10 bits added to D800 */
1270 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001271
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001272 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001273 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001274#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275 break;
1276
1277 default:
1278 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001279 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 }
1284 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001286
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001288 outpos = p-PyUnicode_AS_UNICODE(unicode);
1289 if (unicode_decode_call_errorhandler(
1290 errors, &errorHandler,
1291 "utf8", errmsg,
1292 starts, size, &startinpos, &endinpos, &exc, &s,
1293 (PyObject **)&unicode, &outpos, &p))
1294 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295 }
1296
1297 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001298 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299 goto onError;
1300
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001301 Py_XDECREF(errorHandler);
1302 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 return (PyObject *)unicode;
1304
1305onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001306 Py_XDECREF(errorHandler);
1307 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 Py_DECREF(unicode);
1309 return NULL;
1310}
1311
Tim Peters602f7402002-04-27 18:03:26 +00001312/* Allocation strategy: if the string is short, convert into a stack buffer
1313 and allocate exactly as much space needed at the end. Else allocate the
1314 maximum possible needed (4 result bytes per Unicode character), and return
1315 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001316*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001317PyObject *
1318PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1319 int size,
1320 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321{
Tim Peters602f7402002-04-27 18:03:26 +00001322#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001323
Tim Peters602f7402002-04-27 18:03:26 +00001324 int i; /* index into s of next input byte */
1325 PyObject *v; /* result string object */
1326 char *p; /* next free byte in output buffer */
1327 int nallocated; /* number of result bytes allocated */
1328 int nneeded; /* number of result bytes needed */
1329 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001330
Tim Peters602f7402002-04-27 18:03:26 +00001331 assert(s != NULL);
1332 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333
Tim Peters602f7402002-04-27 18:03:26 +00001334 if (size <= MAX_SHORT_UNICHARS) {
1335 /* Write into the stack buffer; nallocated can't overflow.
1336 * At the end, we'll allocate exactly as much heap space as it
1337 * turns out we need.
1338 */
1339 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1340 v = NULL; /* will allocate after we're done */
1341 p = stackbuf;
1342 }
1343 else {
1344 /* Overallocate on the heap, and give the excess back at the end. */
1345 nallocated = size * 4;
1346 if (nallocated / 4 != size) /* overflow! */
1347 return PyErr_NoMemory();
1348 v = PyString_FromStringAndSize(NULL, nallocated);
1349 if (v == NULL)
1350 return NULL;
1351 p = PyString_AS_STRING(v);
1352 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001353
Tim Peters602f7402002-04-27 18:03:26 +00001354 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001355 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001356
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001357 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001358 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001360
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001362 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001363 *p++ = (char)(0xc0 | (ch >> 6));
1364 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001365 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001366 else {
Tim Peters602f7402002-04-27 18:03:26 +00001367 /* Encode UCS2 Unicode ordinals */
1368 if (ch < 0x10000) {
1369 /* Special case: check for high surrogate */
1370 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1371 Py_UCS4 ch2 = s[i];
1372 /* Check for low surrogate and combine the two to
1373 form a UCS4 value */
1374 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001375 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001376 i++;
1377 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001378 }
Tim Peters602f7402002-04-27 18:03:26 +00001379 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001380 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001381 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001382 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1383 *p++ = (char)(0x80 | (ch & 0x3f));
1384 continue;
1385 }
1386encodeUCS4:
1387 /* Encode UCS4 Unicode ordinals */
1388 *p++ = (char)(0xf0 | (ch >> 18));
1389 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1390 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1391 *p++ = (char)(0x80 | (ch & 0x3f));
1392 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001394
Tim Peters602f7402002-04-27 18:03:26 +00001395 if (v == NULL) {
1396 /* This was stack allocated. */
1397 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1398 assert(nneeded <= nallocated);
1399 v = PyString_FromStringAndSize(stackbuf, nneeded);
1400 }
1401 else {
1402 /* Cut back to size actually needed. */
1403 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1404 assert(nneeded <= nallocated);
1405 _PyString_Resize(&v, nneeded);
1406 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001407 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001408
Tim Peters602f7402002-04-27 18:03:26 +00001409#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410}
1411
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1413{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414 if (!PyUnicode_Check(unicode)) {
1415 PyErr_BadArgument();
1416 return NULL;
1417 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001418 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1419 PyUnicode_GET_SIZE(unicode),
1420 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421}
1422
1423/* --- UTF-16 Codec ------------------------------------------------------- */
1424
Tim Peters772747b2001-08-09 22:21:55 +00001425PyObject *
1426PyUnicode_DecodeUTF16(const char *s,
1427 int size,
1428 const char *errors,
1429 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001431 const char *starts = s;
1432 int startinpos;
1433 int endinpos;
1434 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 PyUnicodeObject *unicode;
1436 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001437 const unsigned char *q, *e;
1438 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001439 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001440 /* Offsets from q for retrieving byte pairs in the right order. */
1441#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1442 int ihi = 1, ilo = 0;
1443#else
1444 int ihi = 0, ilo = 1;
1445#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 PyObject *errorHandler = NULL;
1447 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448
1449 /* Note: size will always be longer than the resulting Unicode
1450 character count */
1451 unicode = _PyUnicode_New(size);
1452 if (!unicode)
1453 return NULL;
1454 if (size == 0)
1455 return (PyObject *)unicode;
1456
1457 /* Unpack UTF-16 encoded data */
1458 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001459 q = (unsigned char *)s;
1460 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461
1462 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001463 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001465 /* Check for BOM marks (U+FEFF) in the input and adjust current
1466 byte order setting accordingly. In native mode, the leading BOM
1467 mark is skipped, in all other modes, it is copied to the output
1468 stream as-is (giving a ZWNBSP character). */
1469 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001470 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001471#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001472 if (bom == 0xFEFF) {
1473 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001474 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001475 }
1476 else if (bom == 0xFFFE) {
1477 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001478 bo = 1;
1479 }
Tim Petersced69f82003-09-16 20:30:58 +00001480#else
Tim Peters772747b2001-08-09 22:21:55 +00001481 if (bom == 0xFEFF) {
1482 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001483 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001484 }
1485 else if (bom == 0xFFFE) {
1486 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001487 bo = -1;
1488 }
1489#endif
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491
Tim Peters772747b2001-08-09 22:21:55 +00001492 if (bo == -1) {
1493 /* force LE */
1494 ihi = 1;
1495 ilo = 0;
1496 }
1497 else if (bo == 1) {
1498 /* force BE */
1499 ihi = 0;
1500 ilo = 1;
1501 }
1502
1503 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 Py_UNICODE ch;
1505 /* remaing bytes at the end? (size should be even) */
1506 if (e-q<2) {
1507 errmsg = "truncated data";
1508 startinpos = ((const char *)q)-starts;
1509 endinpos = ((const char *)e)-starts;
1510 goto utf16Error;
1511 /* The remaining input chars are ignored if the callback
1512 chooses to skip the input */
1513 }
1514 ch = (q[ihi] << 8) | q[ilo];
1515
Tim Peters772747b2001-08-09 22:21:55 +00001516 q += 2;
1517
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518 if (ch < 0xD800 || ch > 0xDFFF) {
1519 *p++ = ch;
1520 continue;
1521 }
1522
1523 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001524 if (q >= e) {
1525 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001526 startinpos = (((const char *)q)-2)-starts;
1527 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001528 goto utf16Error;
1529 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001530 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001531 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1532 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001533 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001534#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001535 *p++ = ch;
1536 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001537#else
1538 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001539#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001540 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001541 }
1542 else {
1543 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001544 startinpos = (((const char *)q)-4)-starts;
1545 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001546 goto utf16Error;
1547 }
1548
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001550 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001551 startinpos = (((const char *)q)-2)-starts;
1552 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001553 /* Fall through to report the error */
1554
1555 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001556 outpos = p-PyUnicode_AS_UNICODE(unicode);
1557 if (unicode_decode_call_errorhandler(
1558 errors, &errorHandler,
1559 "utf16", errmsg,
1560 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1561 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001562 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 }
1564
1565 if (byteorder)
1566 *byteorder = bo;
1567
1568 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001569 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570 goto onError;
1571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001572 Py_XDECREF(errorHandler);
1573 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001574 return (PyObject *)unicode;
1575
1576onError:
1577 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001578 Py_XDECREF(errorHandler);
1579 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580 return NULL;
1581}
1582
Tim Peters772747b2001-08-09 22:21:55 +00001583PyObject *
1584PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1585 int size,
1586 const char *errors,
1587 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588{
1589 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001590 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001591#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001592 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001593#else
1594 const int pairs = 0;
1595#endif
Tim Peters772747b2001-08-09 22:21:55 +00001596 /* Offsets from p for storing byte pairs in the right order. */
1597#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1598 int ihi = 1, ilo = 0;
1599#else
1600 int ihi = 0, ilo = 1;
1601#endif
1602
1603#define STORECHAR(CH) \
1604 do { \
1605 p[ihi] = ((CH) >> 8) & 0xff; \
1606 p[ilo] = (CH) & 0xff; \
1607 p += 2; \
1608 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001610#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001611 for (i = pairs = 0; i < size; i++)
1612 if (s[i] >= 0x10000)
1613 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001614#endif
Tim Petersced69f82003-09-16 20:30:58 +00001615 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001616 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617 if (v == NULL)
1618 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619
Tim Peters772747b2001-08-09 22:21:55 +00001620 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001622 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001623 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001624 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001625
1626 if (byteorder == -1) {
1627 /* force LE */
1628 ihi = 1;
1629 ilo = 0;
1630 }
1631 else if (byteorder == 1) {
1632 /* force BE */
1633 ihi = 0;
1634 ilo = 1;
1635 }
1636
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637 while (size-- > 0) {
1638 Py_UNICODE ch = *s++;
1639 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001640#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001641 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001642 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1643 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001645#endif
Tim Peters772747b2001-08-09 22:21:55 +00001646 STORECHAR(ch);
1647 if (ch2)
1648 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001650 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001651#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652}
1653
1654PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1655{
1656 if (!PyUnicode_Check(unicode)) {
1657 PyErr_BadArgument();
1658 return NULL;
1659 }
1660 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1661 PyUnicode_GET_SIZE(unicode),
1662 NULL,
1663 0);
1664}
1665
1666/* --- Unicode Escape Codec ----------------------------------------------- */
1667
Fredrik Lundh06d12682001-01-24 07:59:11 +00001668static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001669
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1671 int size,
1672 const char *errors)
1673{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001674 const char *starts = s;
1675 int startinpos;
1676 int endinpos;
1677 int outpos;
1678 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001680 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001682 char* message;
1683 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 PyObject *errorHandler = NULL;
1685 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001686
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 /* Escaped strings will always be longer than the resulting
1688 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001689 length after conversion to the true value.
1690 (but if the error callback returns a long replacement string
1691 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 v = _PyUnicode_New(size);
1693 if (v == NULL)
1694 goto onError;
1695 if (size == 0)
1696 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001698 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001700
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701 while (s < end) {
1702 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001703 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001704 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705
1706 /* Non-escape characters are interpreted as Unicode ordinals */
1707 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001708 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001709 continue;
1710 }
1711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001712 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 /* \ - Escapes */
1714 s++;
1715 switch (*s++) {
1716
1717 /* \x escapes */
1718 case '\n': break;
1719 case '\\': *p++ = '\\'; break;
1720 case '\'': *p++ = '\''; break;
1721 case '\"': *p++ = '\"'; break;
1722 case 'b': *p++ = '\b'; break;
1723 case 'f': *p++ = '\014'; break; /* FF */
1724 case 't': *p++ = '\t'; break;
1725 case 'n': *p++ = '\n'; break;
1726 case 'r': *p++ = '\r'; break;
1727 case 'v': *p++ = '\013'; break; /* VT */
1728 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1729
1730 /* \OOO (octal) escapes */
1731 case '0': case '1': case '2': case '3':
1732 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001733 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001735 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001737 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001739 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740 break;
1741
Fredrik Lundhccc74732001-02-18 22:13:49 +00001742 /* hex escapes */
1743 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001744 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001745 digits = 2;
1746 message = "truncated \\xXX escape";
1747 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748
Fredrik Lundhccc74732001-02-18 22:13:49 +00001749 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001751 digits = 4;
1752 message = "truncated \\uXXXX escape";
1753 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
Fredrik Lundhccc74732001-02-18 22:13:49 +00001755 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001756 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001757 digits = 8;
1758 message = "truncated \\UXXXXXXXX escape";
1759 hexescape:
1760 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001761 outpos = p-PyUnicode_AS_UNICODE(v);
1762 if (s+digits>end) {
1763 endinpos = size;
1764 if (unicode_decode_call_errorhandler(
1765 errors, &errorHandler,
1766 "unicodeescape", "end of string in escape sequence",
1767 starts, size, &startinpos, &endinpos, &exc, &s,
1768 (PyObject **)&v, &outpos, &p))
1769 goto onError;
1770 goto nextByte;
1771 }
1772 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001773 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001774 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 endinpos = (s+i+1)-starts;
1776 if (unicode_decode_call_errorhandler(
1777 errors, &errorHandler,
1778 "unicodeescape", message,
1779 starts, size, &startinpos, &endinpos, &exc, &s,
1780 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001781 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001782 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001783 }
1784 chr = (chr<<4) & ~0xF;
1785 if (c >= '0' && c <= '9')
1786 chr += c - '0';
1787 else if (c >= 'a' && c <= 'f')
1788 chr += 10 + c - 'a';
1789 else
1790 chr += 10 + c - 'A';
1791 }
1792 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001793 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001794 /* _decoding_error will have already written into the
1795 target buffer. */
1796 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001797 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001798 /* when we get here, chr is a 32-bit unicode character */
1799 if (chr <= 0xffff)
1800 /* UCS-2 character */
1801 *p++ = (Py_UNICODE) chr;
1802 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001803 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001804 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001805#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001806 *p++ = chr;
1807#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001808 chr -= 0x10000L;
1809 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001810 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001811#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001812 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 endinpos = s-starts;
1814 outpos = p-PyUnicode_AS_UNICODE(v);
1815 if (unicode_decode_call_errorhandler(
1816 errors, &errorHandler,
1817 "unicodeescape", "illegal Unicode character",
1818 starts, size, &startinpos, &endinpos, &exc, &s,
1819 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001820 goto onError;
1821 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 break;
1823
1824 /* \N{name} */
1825 case 'N':
1826 message = "malformed \\N character escape";
1827 if (ucnhash_CAPI == NULL) {
1828 /* load the unicode data module */
1829 PyObject *m, *v;
1830 m = PyImport_ImportModule("unicodedata");
1831 if (m == NULL)
1832 goto ucnhashError;
1833 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1834 Py_DECREF(m);
1835 if (v == NULL)
1836 goto ucnhashError;
1837 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1838 Py_DECREF(v);
1839 if (ucnhash_CAPI == NULL)
1840 goto ucnhashError;
1841 }
1842 if (*s == '{') {
1843 const char *start = s+1;
1844 /* look for the closing brace */
1845 while (*s != '}' && s < end)
1846 s++;
1847 if (s > start && s < end && *s == '}') {
1848 /* found a name. look it up in the unicode database */
1849 message = "unknown Unicode character name";
1850 s++;
1851 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1852 goto store;
1853 }
1854 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001855 endinpos = s-starts;
1856 outpos = p-PyUnicode_AS_UNICODE(v);
1857 if (unicode_decode_call_errorhandler(
1858 errors, &errorHandler,
1859 "unicodeescape", message,
1860 starts, size, &startinpos, &endinpos, &exc, &s,
1861 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 break;
1864
1865 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001866 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 message = "\\ at end of string";
1868 s--;
1869 endinpos = s-starts;
1870 outpos = p-PyUnicode_AS_UNICODE(v);
1871 if (unicode_decode_call_errorhandler(
1872 errors, &errorHandler,
1873 "unicodeescape", message,
1874 starts, size, &startinpos, &endinpos, &exc, &s,
1875 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001876 goto onError;
1877 }
1878 else {
1879 *p++ = '\\';
1880 *p++ = (unsigned char)s[-1];
1881 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001882 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001884 nextByte:
1885 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001887 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001889 Py_XDECREF(errorHandler);
1890 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001891 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001892
Fredrik Lundhccc74732001-02-18 22:13:49 +00001893ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001894 PyErr_SetString(
1895 PyExc_UnicodeError,
1896 "\\N escapes not supported (can't load unicodedata module)"
1897 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 Py_XDECREF(errorHandler);
1899 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001900 return NULL;
1901
Fredrik Lundhccc74732001-02-18 22:13:49 +00001902onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001904 Py_XDECREF(errorHandler);
1905 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906 return NULL;
1907}
1908
1909/* Return a Unicode-Escape string version of the Unicode object.
1910
1911 If quotes is true, the string is enclosed in u"" or u'' quotes as
1912 appropriate.
1913
1914*/
1915
Barry Warsaw51ac5802000-03-20 16:36:48 +00001916static const Py_UNICODE *findchar(const Py_UNICODE *s,
1917 int size,
1918 Py_UNICODE ch);
1919
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920static
1921PyObject *unicodeescape_string(const Py_UNICODE *s,
1922 int size,
1923 int quotes)
1924{
1925 PyObject *repr;
1926 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001928 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929
1930 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1931 if (repr == NULL)
1932 return NULL;
1933
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001934 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935
1936 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001938 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 !findchar(s, size, '"')) ? '"' : '\'';
1940 }
1941 while (size-- > 0) {
1942 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001943
Guido van Rossumd57fd912000-03-10 22:53:23 +00001944 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001945 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001946 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 *p++ = '\\';
1948 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001949 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001950 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001951
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001952#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001953 /* Map 21-bit characters to '\U00xxxxxx' */
1954 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001955 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001956
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001957 /* Resize the string if necessary */
1958 if (offset + 12 > PyString_GET_SIZE(repr)) {
1959 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001960 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001961 p = PyString_AS_STRING(repr) + offset;
1962 }
1963
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001964 *p++ = '\\';
1965 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001966 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1967 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1968 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1969 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1970 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1971 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1972 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001973 *p++ = hexdigit[ch & 0x0000000F];
1974 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001975 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001976#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001977 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1978 else if (ch >= 0xD800 && ch < 0xDC00) {
1979 Py_UNICODE ch2;
1980 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00001981
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001982 ch2 = *s++;
1983 size--;
1984 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1985 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1986 *p++ = '\\';
1987 *p++ = 'U';
1988 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1989 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1990 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1991 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1992 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1993 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1994 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1995 *p++ = hexdigit[ucs & 0x0000000F];
1996 continue;
1997 }
1998 /* Fall through: isolated surrogates are copied as-is */
1999 s--;
2000 size++;
2001 }
2002
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002004 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 *p++ = '\\';
2006 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002007 *p++ = hexdigit[(ch >> 12) & 0x000F];
2008 *p++ = hexdigit[(ch >> 8) & 0x000F];
2009 *p++ = hexdigit[(ch >> 4) & 0x000F];
2010 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002012
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002013 /* Map special whitespace to '\t', \n', '\r' */
2014 else if (ch == '\t') {
2015 *p++ = '\\';
2016 *p++ = 't';
2017 }
2018 else if (ch == '\n') {
2019 *p++ = '\\';
2020 *p++ = 'n';
2021 }
2022 else if (ch == '\r') {
2023 *p++ = '\\';
2024 *p++ = 'r';
2025 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002026
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002027 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002028 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002030 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002031 *p++ = hexdigit[(ch >> 4) & 0x000F];
2032 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002033 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002034
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 /* Copy everything else as-is */
2036 else
2037 *p++ = (char) ch;
2038 }
2039 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002040 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041
2042 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002043 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 return repr;
2045}
2046
2047PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2048 int size)
2049{
2050 return unicodeescape_string(s, size, 0);
2051}
2052
2053PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2054{
2055 if (!PyUnicode_Check(unicode)) {
2056 PyErr_BadArgument();
2057 return NULL;
2058 }
2059 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2060 PyUnicode_GET_SIZE(unicode));
2061}
2062
2063/* --- Raw Unicode Escape Codec ------------------------------------------- */
2064
2065PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2066 int size,
2067 const char *errors)
2068{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069 const char *starts = s;
2070 int startinpos;
2071 int endinpos;
2072 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002074 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 const char *end;
2076 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002077 PyObject *errorHandler = NULL;
2078 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002079
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 /* Escaped strings will always be longer than the resulting
2081 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 length after conversion to the true value. (But decoding error
2083 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 v = _PyUnicode_New(size);
2085 if (v == NULL)
2086 goto onError;
2087 if (size == 0)
2088 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002089 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090 end = s + size;
2091 while (s < end) {
2092 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002093 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002095 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096
2097 /* Non-escape characters are interpreted as Unicode ordinals */
2098 if (*s != '\\') {
2099 *p++ = (unsigned char)*s++;
2100 continue;
2101 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002102 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103
2104 /* \u-escapes are only interpreted iff the number of leading
2105 backslashes if odd */
2106 bs = s;
2107 for (;s < end;) {
2108 if (*s != '\\')
2109 break;
2110 *p++ = (unsigned char)*s++;
2111 }
2112 if (((s - bs) & 1) == 0 ||
2113 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002114 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 continue;
2116 }
2117 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002118 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002119 s++;
2120
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002121 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002123 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002124 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002126 endinpos = s-starts;
2127 if (unicode_decode_call_errorhandler(
2128 errors, &errorHandler,
2129 "rawunicodeescape", "truncated \\uXXXX",
2130 starts, size, &startinpos, &endinpos, &exc, &s,
2131 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 }
2135 x = (x<<4) & ~0xF;
2136 if (c >= '0' && c <= '9')
2137 x += c - '0';
2138 else if (c >= 'a' && c <= 'f')
2139 x += 10 + c - 'a';
2140 else
2141 x += 10 + c - 'A';
2142 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143#ifndef Py_UNICODE_WIDE
2144 if (x > 0x10000) {
2145 if (unicode_decode_call_errorhandler(
2146 errors, &errorHandler,
2147 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2148 starts, size, &startinpos, &endinpos, &exc, &s,
2149 (PyObject **)&v, &outpos, &p))
2150 goto onError;
2151 }
2152#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002153 *p++ = x;
2154 nextByte:
2155 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002157 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002158 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002159 Py_XDECREF(errorHandler);
2160 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002162
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 onError:
2164 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002165 Py_XDECREF(errorHandler);
2166 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 return NULL;
2168}
2169
2170PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2171 int size)
2172{
2173 PyObject *repr;
2174 char *p;
2175 char *q;
2176
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002177 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002179#ifdef Py_UNICODE_WIDE
2180 repr = PyString_FromStringAndSize(NULL, 10 * size);
2181#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002183#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184 if (repr == NULL)
2185 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002186 if (size == 0)
2187 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188
2189 p = q = PyString_AS_STRING(repr);
2190 while (size-- > 0) {
2191 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002192#ifdef Py_UNICODE_WIDE
2193 /* Map 32-bit characters to '\Uxxxxxxxx' */
2194 if (ch >= 0x10000) {
2195 *p++ = '\\';
2196 *p++ = 'U';
2197 *p++ = hexdigit[(ch >> 28) & 0xf];
2198 *p++ = hexdigit[(ch >> 24) & 0xf];
2199 *p++ = hexdigit[(ch >> 20) & 0xf];
2200 *p++ = hexdigit[(ch >> 16) & 0xf];
2201 *p++ = hexdigit[(ch >> 12) & 0xf];
2202 *p++ = hexdigit[(ch >> 8) & 0xf];
2203 *p++ = hexdigit[(ch >> 4) & 0xf];
2204 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002205 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002206 else
2207#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208 /* Map 16-bit characters to '\uxxxx' */
2209 if (ch >= 256) {
2210 *p++ = '\\';
2211 *p++ = 'u';
2212 *p++ = hexdigit[(ch >> 12) & 0xf];
2213 *p++ = hexdigit[(ch >> 8) & 0xf];
2214 *p++ = hexdigit[(ch >> 4) & 0xf];
2215 *p++ = hexdigit[ch & 15];
2216 }
2217 /* Copy everything else as-is */
2218 else
2219 *p++ = (char) ch;
2220 }
2221 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002222 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 return repr;
2224}
2225
2226PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2227{
2228 if (!PyUnicode_Check(unicode)) {
2229 PyErr_BadArgument();
2230 return NULL;
2231 }
2232 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2233 PyUnicode_GET_SIZE(unicode));
2234}
2235
2236/* --- Latin-1 Codec ------------------------------------------------------ */
2237
2238PyObject *PyUnicode_DecodeLatin1(const char *s,
2239 int size,
2240 const char *errors)
2241{
2242 PyUnicodeObject *v;
2243 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002244
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002246 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002247 Py_UNICODE r = *(unsigned char*)s;
2248 return PyUnicode_FromUnicode(&r, 1);
2249 }
2250
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 v = _PyUnicode_New(size);
2252 if (v == NULL)
2253 goto onError;
2254 if (size == 0)
2255 return (PyObject *)v;
2256 p = PyUnicode_AS_UNICODE(v);
2257 while (size-- > 0)
2258 *p++ = (unsigned char)*s++;
2259 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002260
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 onError:
2262 Py_XDECREF(v);
2263 return NULL;
2264}
2265
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266/* create or adjust a UnicodeEncodeError */
2267static void make_encode_exception(PyObject **exceptionObject,
2268 const char *encoding,
2269 const Py_UNICODE *unicode, int size,
2270 int startpos, int endpos,
2271 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002273 if (*exceptionObject == NULL) {
2274 *exceptionObject = PyUnicodeEncodeError_Create(
2275 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 }
2277 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2279 goto onError;
2280 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2281 goto onError;
2282 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2283 goto onError;
2284 return;
2285 onError:
2286 Py_DECREF(*exceptionObject);
2287 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 }
2289}
2290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002291/* raises a UnicodeEncodeError */
2292static void raise_encode_exception(PyObject **exceptionObject,
2293 const char *encoding,
2294 const Py_UNICODE *unicode, int size,
2295 int startpos, int endpos,
2296 const char *reason)
2297{
2298 make_encode_exception(exceptionObject,
2299 encoding, unicode, size, startpos, endpos, reason);
2300 if (*exceptionObject != NULL)
2301 PyCodec_StrictErrors(*exceptionObject);
2302}
2303
2304/* error handling callback helper:
2305 build arguments, call the callback and check the arguments,
2306 put the result into newpos and return the replacement string, which
2307 has to be freed by the caller */
2308static PyObject *unicode_encode_call_errorhandler(const char *errors,
2309 PyObject **errorHandler,
2310 const char *encoding, const char *reason,
2311 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2312 int startpos, int endpos,
2313 int *newpos)
2314{
2315 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2316
2317 PyObject *restuple;
2318 PyObject *resunicode;
2319
2320 if (*errorHandler == NULL) {
2321 *errorHandler = PyCodec_LookupError(errors);
2322 if (*errorHandler == NULL)
2323 return NULL;
2324 }
2325
2326 make_encode_exception(exceptionObject,
2327 encoding, unicode, size, startpos, endpos, reason);
2328 if (*exceptionObject == NULL)
2329 return NULL;
2330
2331 restuple = PyObject_CallFunctionObjArgs(
2332 *errorHandler, *exceptionObject, NULL);
2333 if (restuple == NULL)
2334 return NULL;
2335 if (!PyTuple_Check(restuple)) {
2336 PyErr_Format(PyExc_TypeError, &argparse[4]);
2337 Py_DECREF(restuple);
2338 return NULL;
2339 }
2340 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2341 &resunicode, newpos)) {
2342 Py_DECREF(restuple);
2343 return NULL;
2344 }
2345 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002346 *newpos = size+*newpos;
2347 if (*newpos<0 || *newpos>size) {
2348 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2349 Py_DECREF(restuple);
2350 return NULL;
2351 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002352 Py_INCREF(resunicode);
2353 Py_DECREF(restuple);
2354 return resunicode;
2355}
2356
2357static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2358 int size,
2359 const char *errors,
2360 int limit)
2361{
2362 /* output object */
2363 PyObject *res;
2364 /* pointers to the beginning and end+1 of input */
2365 const Py_UNICODE *startp = p;
2366 const Py_UNICODE *endp = p + size;
2367 /* pointer to the beginning of the unencodable characters */
2368 /* const Py_UNICODE *badp = NULL; */
2369 /* pointer into the output */
2370 char *str;
2371 /* current output position */
2372 int respos = 0;
2373 int ressize;
2374 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2375 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2376 PyObject *errorHandler = NULL;
2377 PyObject *exc = NULL;
2378 /* the following variable is used for caching string comparisons
2379 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2380 int known_errorHandler = -1;
2381
2382 /* allocate enough for a simple encoding without
2383 replacements, if we need more, we'll resize */
2384 res = PyString_FromStringAndSize(NULL, size);
2385 if (res == NULL)
2386 goto onError;
2387 if (size == 0)
2388 return res;
2389 str = PyString_AS_STRING(res);
2390 ressize = size;
2391
2392 while (p<endp) {
2393 Py_UNICODE c = *p;
2394
2395 /* can we encode this? */
2396 if (c<limit) {
2397 /* no overflow check, because we know that the space is enough */
2398 *str++ = (char)c;
2399 ++p;
2400 }
2401 else {
2402 int unicodepos = p-startp;
2403 int requiredsize;
2404 PyObject *repunicode;
2405 int repsize;
2406 int newpos;
2407 int respos;
2408 Py_UNICODE *uni2;
2409 /* startpos for collecting unencodable chars */
2410 const Py_UNICODE *collstart = p;
2411 const Py_UNICODE *collend = p;
2412 /* find all unecodable characters */
2413 while ((collend < endp) && ((*collend)>=limit))
2414 ++collend;
2415 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2416 if (known_errorHandler==-1) {
2417 if ((errors==NULL) || (!strcmp(errors, "strict")))
2418 known_errorHandler = 1;
2419 else if (!strcmp(errors, "replace"))
2420 known_errorHandler = 2;
2421 else if (!strcmp(errors, "ignore"))
2422 known_errorHandler = 3;
2423 else if (!strcmp(errors, "xmlcharrefreplace"))
2424 known_errorHandler = 4;
2425 else
2426 known_errorHandler = 0;
2427 }
2428 switch (known_errorHandler) {
2429 case 1: /* strict */
2430 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2431 goto onError;
2432 case 2: /* replace */
2433 while (collstart++<collend)
2434 *str++ = '?'; /* fall through */
2435 case 3: /* ignore */
2436 p = collend;
2437 break;
2438 case 4: /* xmlcharrefreplace */
2439 respos = str-PyString_AS_STRING(res);
2440 /* determine replacement size (temporarily (mis)uses p) */
2441 for (p = collstart, repsize = 0; p < collend; ++p) {
2442 if (*p<10)
2443 repsize += 2+1+1;
2444 else if (*p<100)
2445 repsize += 2+2+1;
2446 else if (*p<1000)
2447 repsize += 2+3+1;
2448 else if (*p<10000)
2449 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002450#ifndef Py_UNICODE_WIDE
2451 else
2452 repsize += 2+5+1;
2453#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002454 else if (*p<100000)
2455 repsize += 2+5+1;
2456 else if (*p<1000000)
2457 repsize += 2+6+1;
2458 else
2459 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002460#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002461 }
2462 requiredsize = respos+repsize+(endp-collend);
2463 if (requiredsize > ressize) {
2464 if (requiredsize<2*ressize)
2465 requiredsize = 2*ressize;
2466 if (_PyString_Resize(&res, requiredsize))
2467 goto onError;
2468 str = PyString_AS_STRING(res) + respos;
2469 ressize = requiredsize;
2470 }
2471 /* generate replacement (temporarily (mis)uses p) */
2472 for (p = collstart; p < collend; ++p) {
2473 str += sprintf(str, "&#%d;", (int)*p);
2474 }
2475 p = collend;
2476 break;
2477 default:
2478 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2479 encoding, reason, startp, size, &exc,
2480 collstart-startp, collend-startp, &newpos);
2481 if (repunicode == NULL)
2482 goto onError;
2483 /* need more space? (at least enough for what we
2484 have+the replacement+the rest of the string, so
2485 we won't have to check space for encodable characters) */
2486 respos = str-PyString_AS_STRING(res);
2487 repsize = PyUnicode_GET_SIZE(repunicode);
2488 requiredsize = respos+repsize+(endp-collend);
2489 if (requiredsize > ressize) {
2490 if (requiredsize<2*ressize)
2491 requiredsize = 2*ressize;
2492 if (_PyString_Resize(&res, requiredsize)) {
2493 Py_DECREF(repunicode);
2494 goto onError;
2495 }
2496 str = PyString_AS_STRING(res) + respos;
2497 ressize = requiredsize;
2498 }
2499 /* check if there is anything unencodable in the replacement
2500 and copy it to the output */
2501 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2502 c = *uni2;
2503 if (c >= limit) {
2504 raise_encode_exception(&exc, encoding, startp, size,
2505 unicodepos, unicodepos+1, reason);
2506 Py_DECREF(repunicode);
2507 goto onError;
2508 }
2509 *str = (char)c;
2510 }
2511 p = startp + newpos;
2512 Py_DECREF(repunicode);
2513 }
2514 }
2515 }
2516 /* Resize if we allocated to much */
2517 respos = str-PyString_AS_STRING(res);
2518 if (respos<ressize)
2519 /* If this falls res will be NULL */
2520 _PyString_Resize(&res, respos);
2521 Py_XDECREF(errorHandler);
2522 Py_XDECREF(exc);
2523 return res;
2524
2525 onError:
2526 Py_XDECREF(res);
2527 Py_XDECREF(errorHandler);
2528 Py_XDECREF(exc);
2529 return NULL;
2530}
2531
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2533 int size,
2534 const char *errors)
2535{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537}
2538
2539PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2540{
2541 if (!PyUnicode_Check(unicode)) {
2542 PyErr_BadArgument();
2543 return NULL;
2544 }
2545 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2546 PyUnicode_GET_SIZE(unicode),
2547 NULL);
2548}
2549
2550/* --- 7-bit ASCII Codec -------------------------------------------------- */
2551
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552PyObject *PyUnicode_DecodeASCII(const char *s,
2553 int size,
2554 const char *errors)
2555{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002556 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 PyUnicodeObject *v;
2558 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002559 int startinpos;
2560 int endinpos;
2561 int outpos;
2562 const char *e;
2563 PyObject *errorHandler = NULL;
2564 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002565
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002567 if (size == 1 && *(unsigned char*)s < 128) {
2568 Py_UNICODE r = *(unsigned char*)s;
2569 return PyUnicode_FromUnicode(&r, 1);
2570 }
Tim Petersced69f82003-09-16 20:30:58 +00002571
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 v = _PyUnicode_New(size);
2573 if (v == NULL)
2574 goto onError;
2575 if (size == 0)
2576 return (PyObject *)v;
2577 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002578 e = s + size;
2579 while (s < e) {
2580 register unsigned char c = (unsigned char)*s;
2581 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 ++s;
2584 }
2585 else {
2586 startinpos = s-starts;
2587 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002588 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002589 if (unicode_decode_call_errorhandler(
2590 errors, &errorHandler,
2591 "ascii", "ordinal not in range(128)",
2592 starts, size, &startinpos, &endinpos, &exc, &s,
2593 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002597 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002598 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002599 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002600 Py_XDECREF(errorHandler);
2601 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002603
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 onError:
2605 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002606 Py_XDECREF(errorHandler);
2607 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 return NULL;
2609}
2610
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2612 int size,
2613 const char *errors)
2614{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002615 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616}
2617
2618PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2619{
2620 if (!PyUnicode_Check(unicode)) {
2621 PyErr_BadArgument();
2622 return NULL;
2623 }
2624 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2625 PyUnicode_GET_SIZE(unicode),
2626 NULL);
2627}
2628
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002629#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002630
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002631/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002632
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002633PyObject *PyUnicode_DecodeMBCS(const char *s,
2634 int size,
2635 const char *errors)
2636{
2637 PyUnicodeObject *v;
2638 Py_UNICODE *p;
2639
2640 /* First get the size of the result */
2641 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002642 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002643 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2644
2645 v = _PyUnicode_New(usize);
2646 if (v == NULL)
2647 return NULL;
2648 if (usize == 0)
2649 return (PyObject *)v;
2650 p = PyUnicode_AS_UNICODE(v);
2651 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2652 Py_DECREF(v);
2653 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2654 }
2655
2656 return (PyObject *)v;
2657}
2658
2659PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2660 int size,
2661 const char *errors)
2662{
2663 PyObject *repr;
2664 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002665 DWORD mbcssize;
2666
2667 /* If there are no characters, bail now! */
2668 if (size==0)
2669 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002670
2671 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002672 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002673 if (mbcssize==0)
2674 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2675
2676 repr = PyString_FromStringAndSize(NULL, mbcssize);
2677 if (repr == NULL)
2678 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002679 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002680 return repr;
2681
2682 /* Do the conversion */
2683 s = PyString_AS_STRING(repr);
2684 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2685 Py_DECREF(repr);
2686 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2687 }
2688 return repr;
2689}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002690
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002691PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2692{
2693 if (!PyUnicode_Check(unicode)) {
2694 PyErr_BadArgument();
2695 return NULL;
2696 }
2697 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2698 PyUnicode_GET_SIZE(unicode),
2699 NULL);
2700}
2701
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002702#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002703
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704/* --- Character Mapping Codec -------------------------------------------- */
2705
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706PyObject *PyUnicode_DecodeCharmap(const char *s,
2707 int size,
2708 PyObject *mapping,
2709 const char *errors)
2710{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 const char *starts = s;
2712 int startinpos;
2713 int endinpos;
2714 int outpos;
2715 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 PyUnicodeObject *v;
2717 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002718 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002719 PyObject *errorHandler = NULL;
2720 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002721
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 /* Default to Latin-1 */
2723 if (mapping == NULL)
2724 return PyUnicode_DecodeLatin1(s, size, errors);
2725
2726 v = _PyUnicode_New(size);
2727 if (v == NULL)
2728 goto onError;
2729 if (size == 0)
2730 return (PyObject *)v;
2731 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 e = s + size;
2733 while (s < e) {
2734 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 PyObject *w, *x;
2736
2737 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2738 w = PyInt_FromLong((long)ch);
2739 if (w == NULL)
2740 goto onError;
2741 x = PyObject_GetItem(mapping, w);
2742 Py_DECREF(w);
2743 if (x == NULL) {
2744 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002745 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002747 x = Py_None;
2748 Py_INCREF(x);
2749 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002750 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 }
2752
2753 /* Apply mapping */
2754 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002755 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 if (value < 0 || value > 65535) {
2757 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002758 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 Py_DECREF(x);
2760 goto onError;
2761 }
2762 *p++ = (Py_UNICODE)value;
2763 }
2764 else if (x == Py_None) {
2765 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 outpos = p-PyUnicode_AS_UNICODE(v);
2767 startinpos = s-starts;
2768 endinpos = startinpos+1;
2769 if (unicode_decode_call_errorhandler(
2770 errors, &errorHandler,
2771 "charmap", "character maps to <undefined>",
2772 starts, size, &startinpos, &endinpos, &exc, &s,
2773 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 Py_DECREF(x);
2775 goto onError;
2776 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002777 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 }
2779 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002780 int targetsize = PyUnicode_GET_SIZE(x);
2781
2782 if (targetsize == 1)
2783 /* 1-1 mapping */
2784 *p++ = *PyUnicode_AS_UNICODE(x);
2785
2786 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002788 if (targetsize > extrachars) {
2789 /* resize first */
2790 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2791 int needed = (targetsize - extrachars) + \
2792 (targetsize << 2);
2793 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002794 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002795 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002796 Py_DECREF(x);
2797 goto onError;
2798 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002799 p = PyUnicode_AS_UNICODE(v) + oldpos;
2800 }
2801 Py_UNICODE_COPY(p,
2802 PyUnicode_AS_UNICODE(x),
2803 targetsize);
2804 p += targetsize;
2805 extrachars -= targetsize;
2806 }
2807 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 }
2809 else {
2810 /* wrong return value */
2811 PyErr_SetString(PyExc_TypeError,
2812 "character mapping must return integer, None or unicode");
2813 Py_DECREF(x);
2814 goto onError;
2815 }
2816 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 }
2819 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002820 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 Py_XDECREF(errorHandler);
2823 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002825
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 Py_XDECREF(errorHandler);
2828 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829 Py_XDECREF(v);
2830 return NULL;
2831}
2832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833/* Lookup the character ch in the mapping. If the character
2834 can't be found, Py_None is returned (or NULL, if another
2835 error occured). */
2836static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 PyObject *w = PyInt_FromLong((long)c);
2839 PyObject *x;
2840
2841 if (w == NULL)
2842 return NULL;
2843 x = PyObject_GetItem(mapping, w);
2844 Py_DECREF(w);
2845 if (x == NULL) {
2846 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2847 /* No mapping found means: mapping is undefined. */
2848 PyErr_Clear();
2849 x = Py_None;
2850 Py_INCREF(x);
2851 return x;
2852 } else
2853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002855 else if (x == Py_None)
2856 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 else if (PyInt_Check(x)) {
2858 long value = PyInt_AS_LONG(x);
2859 if (value < 0 || value > 255) {
2860 PyErr_SetString(PyExc_TypeError,
2861 "character mapping must be in range(256)");
2862 Py_DECREF(x);
2863 return NULL;
2864 }
2865 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 else if (PyString_Check(x))
2868 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002870 /* wrong return value */
2871 PyErr_SetString(PyExc_TypeError,
2872 "character mapping must return integer, None or str");
2873 Py_DECREF(x);
2874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875 }
2876}
2877
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002878/* lookup the character, put the result in the output string and adjust
2879 various state variables. Reallocate the output string if not enough
2880 space is available. Return a new reference to the object that
2881 was put in the output buffer, or Py_None, if the mapping was undefined
2882 (in which case no character was written) or NULL, if a
2883 reallocation error ocurred. The called must decref the result */
2884static
2885PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2886 PyObject **outobj, int *outpos)
2887{
2888 PyObject *rep = charmapencode_lookup(c, mapping);
2889
2890 if (rep==NULL)
2891 return NULL;
2892 else if (rep==Py_None)
2893 return rep;
2894 else {
2895 char *outstart = PyString_AS_STRING(*outobj);
2896 int outsize = PyString_GET_SIZE(*outobj);
2897 if (PyInt_Check(rep)) {
2898 int requiredsize = *outpos+1;
2899 if (outsize<requiredsize) {
2900 /* exponentially overallocate to minimize reallocations */
2901 if (requiredsize < 2*outsize)
2902 requiredsize = 2*outsize;
2903 if (_PyString_Resize(outobj, requiredsize)) {
2904 Py_DECREF(rep);
2905 return NULL;
2906 }
2907 outstart = PyString_AS_STRING(*outobj);
2908 }
2909 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2910 }
2911 else {
2912 const char *repchars = PyString_AS_STRING(rep);
2913 int repsize = PyString_GET_SIZE(rep);
2914 int requiredsize = *outpos+repsize;
2915 if (outsize<requiredsize) {
2916 /* exponentially overallocate to minimize reallocations */
2917 if (requiredsize < 2*outsize)
2918 requiredsize = 2*outsize;
2919 if (_PyString_Resize(outobj, requiredsize)) {
2920 Py_DECREF(rep);
2921 return NULL;
2922 }
2923 outstart = PyString_AS_STRING(*outobj);
2924 }
2925 memcpy(outstart + *outpos, repchars, repsize);
2926 *outpos += repsize;
2927 }
2928 }
2929 return rep;
2930}
2931
2932/* handle an error in PyUnicode_EncodeCharmap
2933 Return 0 on success, -1 on error */
2934static
2935int charmap_encoding_error(
2936 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2937 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002938 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002939 PyObject **res, int *respos)
2940{
2941 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2942 int repsize;
2943 int newpos;
2944 Py_UNICODE *uni2;
2945 /* startpos for collecting unencodable chars */
2946 int collstartpos = *inpos;
2947 int collendpos = *inpos+1;
2948 int collpos;
2949 char *encoding = "charmap";
2950 char *reason = "character maps to <undefined>";
2951
2952 PyObject *x;
2953 /* find all unencodable characters */
2954 while (collendpos < size) {
2955 x = charmapencode_lookup(p[collendpos], mapping);
2956 if (x==NULL)
2957 return -1;
2958 else if (x!=Py_None) {
2959 Py_DECREF(x);
2960 break;
2961 }
2962 Py_DECREF(x);
2963 ++collendpos;
2964 }
2965 /* cache callback name lookup
2966 * (if not done yet, i.e. it's the first error) */
2967 if (*known_errorHandler==-1) {
2968 if ((errors==NULL) || (!strcmp(errors, "strict")))
2969 *known_errorHandler = 1;
2970 else if (!strcmp(errors, "replace"))
2971 *known_errorHandler = 2;
2972 else if (!strcmp(errors, "ignore"))
2973 *known_errorHandler = 3;
2974 else if (!strcmp(errors, "xmlcharrefreplace"))
2975 *known_errorHandler = 4;
2976 else
2977 *known_errorHandler = 0;
2978 }
2979 switch (*known_errorHandler) {
2980 case 1: /* strict */
2981 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2982 return -1;
2983 case 2: /* replace */
2984 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2985 x = charmapencode_output('?', mapping, res, respos);
2986 if (x==NULL) {
2987 return -1;
2988 }
2989 else if (x==Py_None) {
2990 Py_DECREF(x);
2991 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2992 return -1;
2993 }
2994 Py_DECREF(x);
2995 }
2996 /* fall through */
2997 case 3: /* ignore */
2998 *inpos = collendpos;
2999 break;
3000 case 4: /* xmlcharrefreplace */
3001 /* generate replacement (temporarily (mis)uses p) */
3002 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3003 char buffer[2+29+1+1];
3004 char *cp;
3005 sprintf(buffer, "&#%d;", (int)p[collpos]);
3006 for (cp = buffer; *cp; ++cp) {
3007 x = charmapencode_output(*cp, mapping, res, respos);
3008 if (x==NULL)
3009 return -1;
3010 else if (x==Py_None) {
3011 Py_DECREF(x);
3012 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3013 return -1;
3014 }
3015 Py_DECREF(x);
3016 }
3017 }
3018 *inpos = collendpos;
3019 break;
3020 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003021 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022 encoding, reason, p, size, exceptionObject,
3023 collstartpos, collendpos, &newpos);
3024 if (repunicode == NULL)
3025 return -1;
3026 /* generate replacement */
3027 repsize = PyUnicode_GET_SIZE(repunicode);
3028 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3029 x = charmapencode_output(*uni2, mapping, res, respos);
3030 if (x==NULL) {
3031 Py_DECREF(repunicode);
3032 return -1;
3033 }
3034 else if (x==Py_None) {
3035 Py_DECREF(repunicode);
3036 Py_DECREF(x);
3037 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3038 return -1;
3039 }
3040 Py_DECREF(x);
3041 }
3042 *inpos = newpos;
3043 Py_DECREF(repunicode);
3044 }
3045 return 0;
3046}
3047
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3049 int size,
3050 PyObject *mapping,
3051 const char *errors)
3052{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 /* output object */
3054 PyObject *res = NULL;
3055 /* current input position */
3056 int inpos = 0;
3057 /* current output position */
3058 int respos = 0;
3059 PyObject *errorHandler = NULL;
3060 PyObject *exc = NULL;
3061 /* the following variable is used for caching string comparisons
3062 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3063 * 3=ignore, 4=xmlcharrefreplace */
3064 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065
3066 /* Default to Latin-1 */
3067 if (mapping == NULL)
3068 return PyUnicode_EncodeLatin1(p, size, errors);
3069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 /* allocate enough for a simple encoding without
3071 replacements, if we need more, we'll resize */
3072 res = PyString_FromStringAndSize(NULL, size);
3073 if (res == NULL)
3074 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003075 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 while (inpos<size) {
3079 /* try to encode it */
3080 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3081 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003083 if (x==Py_None) { /* unencodable character */
3084 if (charmap_encoding_error(p, size, &inpos, mapping,
3085 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003086 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003087 &res, &respos)) {
3088 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003089 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003092 else
3093 /* done with this character => adjust input position */
3094 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095 Py_DECREF(x);
3096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003098 /* Resize if we allocated to much */
3099 if (respos<PyString_GET_SIZE(res)) {
3100 if (_PyString_Resize(&res, respos))
3101 goto onError;
3102 }
3103 Py_XDECREF(exc);
3104 Py_XDECREF(errorHandler);
3105 return res;
3106
3107 onError:
3108 Py_XDECREF(res);
3109 Py_XDECREF(exc);
3110 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 return NULL;
3112}
3113
3114PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3115 PyObject *mapping)
3116{
3117 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3118 PyErr_BadArgument();
3119 return NULL;
3120 }
3121 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3122 PyUnicode_GET_SIZE(unicode),
3123 mapping,
3124 NULL);
3125}
3126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003127/* create or adjust a UnicodeTranslateError */
3128static void make_translate_exception(PyObject **exceptionObject,
3129 const Py_UNICODE *unicode, int size,
3130 int startpos, int endpos,
3131 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003133 if (*exceptionObject == NULL) {
3134 *exceptionObject = PyUnicodeTranslateError_Create(
3135 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 }
3137 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3139 goto onError;
3140 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3141 goto onError;
3142 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3143 goto onError;
3144 return;
3145 onError:
3146 Py_DECREF(*exceptionObject);
3147 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148 }
3149}
3150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151/* raises a UnicodeTranslateError */
3152static void raise_translate_exception(PyObject **exceptionObject,
3153 const Py_UNICODE *unicode, int size,
3154 int startpos, int endpos,
3155 const char *reason)
3156{
3157 make_translate_exception(exceptionObject,
3158 unicode, size, startpos, endpos, reason);
3159 if (*exceptionObject != NULL)
3160 PyCodec_StrictErrors(*exceptionObject);
3161}
3162
3163/* error handling callback helper:
3164 build arguments, call the callback and check the arguments,
3165 put the result into newpos and return the replacement string, which
3166 has to be freed by the caller */
3167static PyObject *unicode_translate_call_errorhandler(const char *errors,
3168 PyObject **errorHandler,
3169 const char *reason,
3170 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3171 int startpos, int endpos,
3172 int *newpos)
3173{
3174 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3175
3176 PyObject *restuple;
3177 PyObject *resunicode;
3178
3179 if (*errorHandler == NULL) {
3180 *errorHandler = PyCodec_LookupError(errors);
3181 if (*errorHandler == NULL)
3182 return NULL;
3183 }
3184
3185 make_translate_exception(exceptionObject,
3186 unicode, size, startpos, endpos, reason);
3187 if (*exceptionObject == NULL)
3188 return NULL;
3189
3190 restuple = PyObject_CallFunctionObjArgs(
3191 *errorHandler, *exceptionObject, NULL);
3192 if (restuple == NULL)
3193 return NULL;
3194 if (!PyTuple_Check(restuple)) {
3195 PyErr_Format(PyExc_TypeError, &argparse[4]);
3196 Py_DECREF(restuple);
3197 return NULL;
3198 }
3199 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3200 &resunicode, newpos)) {
3201 Py_DECREF(restuple);
3202 return NULL;
3203 }
3204 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003205 *newpos = size+*newpos;
3206 if (*newpos<0 || *newpos>size) {
3207 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3208 Py_DECREF(restuple);
3209 return NULL;
3210 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 Py_INCREF(resunicode);
3212 Py_DECREF(restuple);
3213 return resunicode;
3214}
3215
3216/* Lookup the character ch in the mapping and put the result in result,
3217 which must be decrefed by the caller.
3218 Return 0 on success, -1 on error */
3219static
3220int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3221{
3222 PyObject *w = PyInt_FromLong((long)c);
3223 PyObject *x;
3224
3225 if (w == NULL)
3226 return -1;
3227 x = PyObject_GetItem(mapping, w);
3228 Py_DECREF(w);
3229 if (x == NULL) {
3230 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3231 /* No mapping found means: use 1:1 mapping. */
3232 PyErr_Clear();
3233 *result = NULL;
3234 return 0;
3235 } else
3236 return -1;
3237 }
3238 else if (x == Py_None) {
3239 *result = x;
3240 return 0;
3241 }
3242 else if (PyInt_Check(x)) {
3243 long value = PyInt_AS_LONG(x);
3244 long max = PyUnicode_GetMax();
3245 if (value < 0 || value > max) {
3246 PyErr_Format(PyExc_TypeError,
3247 "character mapping must be in range(0x%lx)", max+1);
3248 Py_DECREF(x);
3249 return -1;
3250 }
3251 *result = x;
3252 return 0;
3253 }
3254 else if (PyUnicode_Check(x)) {
3255 *result = x;
3256 return 0;
3257 }
3258 else {
3259 /* wrong return value */
3260 PyErr_SetString(PyExc_TypeError,
3261 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003262 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 return -1;
3264 }
3265}
3266/* ensure that *outobj is at least requiredsize characters long,
3267if not reallocate and adjust various state variables.
3268Return 0 on success, -1 on error */
3269static
Walter Dörwald4894c302003-10-24 14:25:28 +00003270int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 int requiredsize)
3272{
Walter Dörwald4894c302003-10-24 14:25:28 +00003273 int oldsize = PyUnicode_GET_SIZE(*outobj);
3274 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275 /* remember old output position */
3276 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3277 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003278 if (requiredsize < 2 * oldsize)
3279 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003280 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 return -1;
3282 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 }
3284 return 0;
3285}
3286/* lookup the character, put the result in the output string and adjust
3287 various state variables. Return a new reference to the object that
3288 was put in the output buffer in *result, or Py_None, if the mapping was
3289 undefined (in which case no character was written).
3290 The called must decref result.
3291 Return 0 on success, -1 on error. */
3292static
Walter Dörwald4894c302003-10-24 14:25:28 +00003293int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3294 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3295 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296{
Walter Dörwald4894c302003-10-24 14:25:28 +00003297 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 return -1;
3299 if (*res==NULL) {
3300 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003301 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 }
3303 else if (*res==Py_None)
3304 ;
3305 else if (PyInt_Check(*res)) {
3306 /* no overflow check, because we know that the space is enough */
3307 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3308 }
3309 else if (PyUnicode_Check(*res)) {
3310 int repsize = PyUnicode_GET_SIZE(*res);
3311 if (repsize==1) {
3312 /* no overflow check, because we know that the space is enough */
3313 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3314 }
3315 else if (repsize!=0) {
3316 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003317 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003318 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003319 repsize - 1;
3320 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321 return -1;
3322 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3323 *outp += repsize;
3324 }
3325 }
3326 else
3327 return -1;
3328 return 0;
3329}
3330
3331PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 int size,
3333 PyObject *mapping,
3334 const char *errors)
3335{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336 /* output object */
3337 PyObject *res = NULL;
3338 /* pointers to the beginning and end+1 of input */
3339 const Py_UNICODE *startp = p;
3340 const Py_UNICODE *endp = p + size;
3341 /* pointer into the output */
3342 Py_UNICODE *str;
3343 /* current output position */
3344 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345 char *reason = "character maps to <undefined>";
3346 PyObject *errorHandler = NULL;
3347 PyObject *exc = NULL;
3348 /* the following variable is used for caching string comparisons
3349 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3350 * 3=ignore, 4=xmlcharrefreplace */
3351 int known_errorHandler = -1;
3352
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 if (mapping == NULL) {
3354 PyErr_BadArgument();
3355 return NULL;
3356 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003357
3358 /* allocate enough for a simple 1:1 translation without
3359 replacements, if we need more, we'll resize */
3360 res = PyUnicode_FromUnicode(NULL, size);
3361 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003362 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003364 return res;
3365 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003367 while (p<endp) {
3368 /* try to encode it */
3369 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003370 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003371 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 goto onError;
3373 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003374 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003375 if (x!=Py_None) /* it worked => adjust input pointer */
3376 ++p;
3377 else { /* untranslatable character */
3378 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3379 int repsize;
3380 int newpos;
3381 Py_UNICODE *uni2;
3382 /* startpos for collecting untranslatable chars */
3383 const Py_UNICODE *collstart = p;
3384 const Py_UNICODE *collend = p+1;
3385 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003387 /* find all untranslatable characters */
3388 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003389 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 goto onError;
3391 Py_XDECREF(x);
3392 if (x!=Py_None)
3393 break;
3394 ++collend;
3395 }
3396 /* cache callback name lookup
3397 * (if not done yet, i.e. it's the first error) */
3398 if (known_errorHandler==-1) {
3399 if ((errors==NULL) || (!strcmp(errors, "strict")))
3400 known_errorHandler = 1;
3401 else if (!strcmp(errors, "replace"))
3402 known_errorHandler = 2;
3403 else if (!strcmp(errors, "ignore"))
3404 known_errorHandler = 3;
3405 else if (!strcmp(errors, "xmlcharrefreplace"))
3406 known_errorHandler = 4;
3407 else
3408 known_errorHandler = 0;
3409 }
3410 switch (known_errorHandler) {
3411 case 1: /* strict */
3412 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3413 goto onError;
3414 case 2: /* replace */
3415 /* No need to check for space, this is a 1:1 replacement */
3416 for (coll = collstart; coll<collend; ++coll)
3417 *str++ = '?';
3418 /* fall through */
3419 case 3: /* ignore */
3420 p = collend;
3421 break;
3422 case 4: /* xmlcharrefreplace */
3423 /* generate replacement (temporarily (mis)uses p) */
3424 for (p = collstart; p < collend; ++p) {
3425 char buffer[2+29+1+1];
3426 char *cp;
3427 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003428 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3430 goto onError;
3431 for (cp = buffer; *cp; ++cp)
3432 *str++ = *cp;
3433 }
3434 p = collend;
3435 break;
3436 default:
3437 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3438 reason, startp, size, &exc,
3439 collstart-startp, collend-startp, &newpos);
3440 if (repunicode == NULL)
3441 goto onError;
3442 /* generate replacement */
3443 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003444 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3446 Py_DECREF(repunicode);
3447 goto onError;
3448 }
3449 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3450 *str++ = *uni2;
3451 p = startp + newpos;
3452 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 }
3454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456 /* Resize if we allocated to much */
3457 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003458 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003459 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003460 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461 }
3462 Py_XDECREF(exc);
3463 Py_XDECREF(errorHandler);
3464 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 onError:
3467 Py_XDECREF(res);
3468 Py_XDECREF(exc);
3469 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 return NULL;
3471}
3472
3473PyObject *PyUnicode_Translate(PyObject *str,
3474 PyObject *mapping,
3475 const char *errors)
3476{
3477 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003478
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479 str = PyUnicode_FromObject(str);
3480 if (str == NULL)
3481 goto onError;
3482 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3483 PyUnicode_GET_SIZE(str),
3484 mapping,
3485 errors);
3486 Py_DECREF(str);
3487 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003488
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 onError:
3490 Py_XDECREF(str);
3491 return NULL;
3492}
Tim Petersced69f82003-09-16 20:30:58 +00003493
Guido van Rossum9e896b32000-04-05 20:11:21 +00003494/* --- Decimal Encoder ---------------------------------------------------- */
3495
3496int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3497 int length,
3498 char *output,
3499 const char *errors)
3500{
3501 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 PyObject *errorHandler = NULL;
3503 PyObject *exc = NULL;
3504 const char *encoding = "decimal";
3505 const char *reason = "invalid decimal Unicode string";
3506 /* the following variable is used for caching string comparisons
3507 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3508 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003509
3510 if (output == NULL) {
3511 PyErr_BadArgument();
3512 return -1;
3513 }
3514
3515 p = s;
3516 end = s + length;
3517 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003519 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520 PyObject *repunicode;
3521 int repsize;
3522 int newpos;
3523 Py_UNICODE *uni2;
3524 Py_UNICODE *collstart;
3525 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003526
Guido van Rossum9e896b32000-04-05 20:11:21 +00003527 if (Py_UNICODE_ISSPACE(ch)) {
3528 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003530 continue;
3531 }
3532 decimal = Py_UNICODE_TODECIMAL(ch);
3533 if (decimal >= 0) {
3534 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003536 continue;
3537 }
Guido van Rossumba477042000-04-06 18:18:10 +00003538 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003539 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003541 continue;
3542 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 /* All other characters are considered unencodable */
3544 collstart = p;
3545 collend = p+1;
3546 while (collend < end) {
3547 if ((0 < *collend && *collend < 256) ||
3548 !Py_UNICODE_ISSPACE(*collend) ||
3549 Py_UNICODE_TODECIMAL(*collend))
3550 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003551 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 /* cache callback name lookup
3553 * (if not done yet, i.e. it's the first error) */
3554 if (known_errorHandler==-1) {
3555 if ((errors==NULL) || (!strcmp(errors, "strict")))
3556 known_errorHandler = 1;
3557 else if (!strcmp(errors, "replace"))
3558 known_errorHandler = 2;
3559 else if (!strcmp(errors, "ignore"))
3560 known_errorHandler = 3;
3561 else if (!strcmp(errors, "xmlcharrefreplace"))
3562 known_errorHandler = 4;
3563 else
3564 known_errorHandler = 0;
3565 }
3566 switch (known_errorHandler) {
3567 case 1: /* strict */
3568 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3569 goto onError;
3570 case 2: /* replace */
3571 for (p = collstart; p < collend; ++p)
3572 *output++ = '?';
3573 /* fall through */
3574 case 3: /* ignore */
3575 p = collend;
3576 break;
3577 case 4: /* xmlcharrefreplace */
3578 /* generate replacement (temporarily (mis)uses p) */
3579 for (p = collstart; p < collend; ++p)
3580 output += sprintf(output, "&#%d;", (int)*p);
3581 p = collend;
3582 break;
3583 default:
3584 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3585 encoding, reason, s, length, &exc,
3586 collstart-s, collend-s, &newpos);
3587 if (repunicode == NULL)
3588 goto onError;
3589 /* generate replacement */
3590 repsize = PyUnicode_GET_SIZE(repunicode);
3591 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3592 Py_UNICODE ch = *uni2;
3593 if (Py_UNICODE_ISSPACE(ch))
3594 *output++ = ' ';
3595 else {
3596 decimal = Py_UNICODE_TODECIMAL(ch);
3597 if (decimal >= 0)
3598 *output++ = '0' + decimal;
3599 else if (0 < ch && ch < 256)
3600 *output++ = (char)ch;
3601 else {
3602 Py_DECREF(repunicode);
3603 raise_encode_exception(&exc, encoding,
3604 s, length, collstart-s, collend-s, reason);
3605 goto onError;
3606 }
3607 }
3608 }
3609 p = s + newpos;
3610 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003611 }
3612 }
3613 /* 0-terminate the output string */
3614 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 Py_XDECREF(exc);
3616 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003617 return 0;
3618
3619 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 Py_XDECREF(exc);
3621 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003622 return -1;
3623}
3624
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625/* --- Helpers ------------------------------------------------------------ */
3626
Tim Petersced69f82003-09-16 20:30:58 +00003627static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628int count(PyUnicodeObject *self,
3629 int start,
3630 int end,
3631 PyUnicodeObject *substring)
3632{
3633 int count = 0;
3634
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003635 if (start < 0)
3636 start += self->length;
3637 if (start < 0)
3638 start = 0;
3639 if (end > self->length)
3640 end = self->length;
3641 if (end < 0)
3642 end += self->length;
3643 if (end < 0)
3644 end = 0;
3645
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003646 if (substring->length == 0)
3647 return (end - start + 1);
3648
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 end -= substring->length;
3650
3651 while (start <= end)
3652 if (Py_UNICODE_MATCH(self, start, substring)) {
3653 count++;
3654 start += substring->length;
3655 } else
3656 start++;
3657
3658 return count;
3659}
3660
3661int PyUnicode_Count(PyObject *str,
3662 PyObject *substr,
3663 int start,
3664 int end)
3665{
3666 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003667
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 str = PyUnicode_FromObject(str);
3669 if (str == NULL)
3670 return -1;
3671 substr = PyUnicode_FromObject(substr);
3672 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003673 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 return -1;
3675 }
Tim Petersced69f82003-09-16 20:30:58 +00003676
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 result = count((PyUnicodeObject *)str,
3678 start, end,
3679 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003680
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 Py_DECREF(str);
3682 Py_DECREF(substr);
3683 return result;
3684}
3685
Tim Petersced69f82003-09-16 20:30:58 +00003686static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687int findstring(PyUnicodeObject *self,
3688 PyUnicodeObject *substring,
3689 int start,
3690 int end,
3691 int direction)
3692{
3693 if (start < 0)
3694 start += self->length;
3695 if (start < 0)
3696 start = 0;
3697
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 if (end > self->length)
3699 end = self->length;
3700 if (end < 0)
3701 end += self->length;
3702 if (end < 0)
3703 end = 0;
3704
Guido van Rossum76afbd92002-08-20 17:29:29 +00003705 if (substring->length == 0)
3706 return (direction > 0) ? start : end;
3707
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 end -= substring->length;
3709
3710 if (direction < 0) {
3711 for (; end >= start; end--)
3712 if (Py_UNICODE_MATCH(self, end, substring))
3713 return end;
3714 } else {
3715 for (; start <= end; start++)
3716 if (Py_UNICODE_MATCH(self, start, substring))
3717 return start;
3718 }
3719
3720 return -1;
3721}
3722
3723int PyUnicode_Find(PyObject *str,
3724 PyObject *substr,
3725 int start,
3726 int end,
3727 int direction)
3728{
3729 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003730
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 str = PyUnicode_FromObject(str);
3732 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003733 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734 substr = PyUnicode_FromObject(substr);
3735 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003736 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003737 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 }
Tim Petersced69f82003-09-16 20:30:58 +00003739
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 result = findstring((PyUnicodeObject *)str,
3741 (PyUnicodeObject *)substr,
3742 start, end, direction);
3743 Py_DECREF(str);
3744 Py_DECREF(substr);
3745 return result;
3746}
3747
Tim Petersced69f82003-09-16 20:30:58 +00003748static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749int tailmatch(PyUnicodeObject *self,
3750 PyUnicodeObject *substring,
3751 int start,
3752 int end,
3753 int direction)
3754{
3755 if (start < 0)
3756 start += self->length;
3757 if (start < 0)
3758 start = 0;
3759
3760 if (substring->length == 0)
3761 return 1;
3762
3763 if (end > self->length)
3764 end = self->length;
3765 if (end < 0)
3766 end += self->length;
3767 if (end < 0)
3768 end = 0;
3769
3770 end -= substring->length;
3771 if (end < start)
3772 return 0;
3773
3774 if (direction > 0) {
3775 if (Py_UNICODE_MATCH(self, end, substring))
3776 return 1;
3777 } else {
3778 if (Py_UNICODE_MATCH(self, start, substring))
3779 return 1;
3780 }
3781
3782 return 0;
3783}
3784
3785int PyUnicode_Tailmatch(PyObject *str,
3786 PyObject *substr,
3787 int start,
3788 int end,
3789 int direction)
3790{
3791 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003792
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 str = PyUnicode_FromObject(str);
3794 if (str == NULL)
3795 return -1;
3796 substr = PyUnicode_FromObject(substr);
3797 if (substr == NULL) {
3798 Py_DECREF(substr);
3799 return -1;
3800 }
Tim Petersced69f82003-09-16 20:30:58 +00003801
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 result = tailmatch((PyUnicodeObject *)str,
3803 (PyUnicodeObject *)substr,
3804 start, end, direction);
3805 Py_DECREF(str);
3806 Py_DECREF(substr);
3807 return result;
3808}
3809
Tim Petersced69f82003-09-16 20:30:58 +00003810static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811const Py_UNICODE *findchar(const Py_UNICODE *s,
3812 int size,
3813 Py_UNICODE ch)
3814{
3815 /* like wcschr, but doesn't stop at NULL characters */
3816
3817 while (size-- > 0) {
3818 if (*s == ch)
3819 return s;
3820 s++;
3821 }
3822
3823 return NULL;
3824}
3825
3826/* Apply fixfct filter to the Unicode object self and return a
3827 reference to the modified object */
3828
Tim Petersced69f82003-09-16 20:30:58 +00003829static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830PyObject *fixup(PyUnicodeObject *self,
3831 int (*fixfct)(PyUnicodeObject *s))
3832{
3833
3834 PyUnicodeObject *u;
3835
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003836 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 if (u == NULL)
3838 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003839
3840 Py_UNICODE_COPY(u->str, self->str, self->length);
3841
Tim Peters7a29bd52001-09-12 03:03:31 +00003842 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 /* fixfct should return TRUE if it modified the buffer. If
3844 FALSE, return a reference to the original buffer instead
3845 (to save space, not time) */
3846 Py_INCREF(self);
3847 Py_DECREF(u);
3848 return (PyObject*) self;
3849 }
3850 return (PyObject*) u;
3851}
3852
Tim Petersced69f82003-09-16 20:30:58 +00003853static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854int fixupper(PyUnicodeObject *self)
3855{
3856 int len = self->length;
3857 Py_UNICODE *s = self->str;
3858 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003859
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860 while (len-- > 0) {
3861 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003862
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 ch = Py_UNICODE_TOUPPER(*s);
3864 if (ch != *s) {
3865 status = 1;
3866 *s = ch;
3867 }
3868 s++;
3869 }
3870
3871 return status;
3872}
3873
Tim Petersced69f82003-09-16 20:30:58 +00003874static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875int fixlower(PyUnicodeObject *self)
3876{
3877 int len = self->length;
3878 Py_UNICODE *s = self->str;
3879 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003880
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881 while (len-- > 0) {
3882 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003883
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 ch = Py_UNICODE_TOLOWER(*s);
3885 if (ch != *s) {
3886 status = 1;
3887 *s = ch;
3888 }
3889 s++;
3890 }
3891
3892 return status;
3893}
3894
Tim Petersced69f82003-09-16 20:30:58 +00003895static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896int fixswapcase(PyUnicodeObject *self)
3897{
3898 int len = self->length;
3899 Py_UNICODE *s = self->str;
3900 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003901
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 while (len-- > 0) {
3903 if (Py_UNICODE_ISUPPER(*s)) {
3904 *s = Py_UNICODE_TOLOWER(*s);
3905 status = 1;
3906 } else if (Py_UNICODE_ISLOWER(*s)) {
3907 *s = Py_UNICODE_TOUPPER(*s);
3908 status = 1;
3909 }
3910 s++;
3911 }
3912
3913 return status;
3914}
3915
Tim Petersced69f82003-09-16 20:30:58 +00003916static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917int fixcapitalize(PyUnicodeObject *self)
3918{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003919 int len = self->length;
3920 Py_UNICODE *s = self->str;
3921 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003922
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003923 if (len == 0)
3924 return 0;
3925 if (Py_UNICODE_ISLOWER(*s)) {
3926 *s = Py_UNICODE_TOUPPER(*s);
3927 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003929 s++;
3930 while (--len > 0) {
3931 if (Py_UNICODE_ISUPPER(*s)) {
3932 *s = Py_UNICODE_TOLOWER(*s);
3933 status = 1;
3934 }
3935 s++;
3936 }
3937 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938}
3939
3940static
3941int fixtitle(PyUnicodeObject *self)
3942{
3943 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3944 register Py_UNICODE *e;
3945 int previous_is_cased;
3946
3947 /* Shortcut for single character strings */
3948 if (PyUnicode_GET_SIZE(self) == 1) {
3949 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3950 if (*p != ch) {
3951 *p = ch;
3952 return 1;
3953 }
3954 else
3955 return 0;
3956 }
Tim Petersced69f82003-09-16 20:30:58 +00003957
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 e = p + PyUnicode_GET_SIZE(self);
3959 previous_is_cased = 0;
3960 for (; p < e; p++) {
3961 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003962
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 if (previous_is_cased)
3964 *p = Py_UNICODE_TOLOWER(ch);
3965 else
3966 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003967
3968 if (Py_UNICODE_ISLOWER(ch) ||
3969 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 Py_UNICODE_ISTITLE(ch))
3971 previous_is_cased = 1;
3972 else
3973 previous_is_cased = 0;
3974 }
3975 return 1;
3976}
3977
3978PyObject *PyUnicode_Join(PyObject *separator,
3979 PyObject *seq)
3980{
3981 Py_UNICODE *sep;
3982 int seplen;
3983 PyUnicodeObject *res = NULL;
3984 int reslen = 0;
3985 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986 int sz = 100;
3987 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003988 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989
Tim Peters2cfe3682001-05-05 05:36:48 +00003990 it = PyObject_GetIter(seq);
3991 if (it == NULL)
3992 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993
3994 if (separator == NULL) {
3995 Py_UNICODE blank = ' ';
3996 sep = &blank;
3997 seplen = 1;
3998 }
3999 else {
4000 separator = PyUnicode_FromObject(separator);
4001 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00004002 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 sep = PyUnicode_AS_UNICODE(separator);
4004 seplen = PyUnicode_GET_SIZE(separator);
4005 }
Tim Petersced69f82003-09-16 20:30:58 +00004006
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 res = _PyUnicode_New(sz);
4008 if (res == NULL)
4009 goto onError;
4010 p = PyUnicode_AS_UNICODE(res);
4011 reslen = 0;
4012
Tim Peters2cfe3682001-05-05 05:36:48 +00004013 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00004015 PyObject *item = PyIter_Next(it);
4016 if (item == NULL) {
4017 if (PyErr_Occurred())
4018 goto onError;
4019 break;
4020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 if (!PyUnicode_Check(item)) {
4022 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004023 if (!PyString_Check(item)) {
4024 PyErr_Format(PyExc_TypeError,
4025 "sequence item %i: expected string or Unicode,"
4026 " %.80s found",
4027 i, item->ob_type->tp_name);
4028 Py_DECREF(item);
4029 goto onError;
4030 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 v = PyUnicode_FromObject(item);
4032 Py_DECREF(item);
4033 item = v;
4034 if (item == NULL)
4035 goto onError;
4036 }
4037 itemlen = PyUnicode_GET_SIZE(item);
4038 while (reslen + itemlen + seplen >= sz) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004039 if (_PyUnicode_Resize(&res, sz*2) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004040 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004043 sz *= 2;
4044 p = PyUnicode_AS_UNICODE(res) + reslen;
4045 }
4046 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004047 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 p += seplen;
4049 reslen += seplen;
4050 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004051 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 p += itemlen;
4053 reslen += itemlen;
4054 Py_DECREF(item);
4055 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004056 if (_PyUnicode_Resize(&res, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057 goto onError;
4058
4059 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004060 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 return (PyObject *)res;
4062
4063 onError:
4064 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004065 Py_XDECREF(res);
4066 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 return NULL;
4068}
4069
Tim Petersced69f82003-09-16 20:30:58 +00004070static
4071PyUnicodeObject *pad(PyUnicodeObject *self,
4072 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 int right,
4074 Py_UNICODE fill)
4075{
4076 PyUnicodeObject *u;
4077
4078 if (left < 0)
4079 left = 0;
4080 if (right < 0)
4081 right = 0;
4082
Tim Peters7a29bd52001-09-12 03:03:31 +00004083 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 Py_INCREF(self);
4085 return self;
4086 }
4087
4088 u = _PyUnicode_New(left + self->length + right);
4089 if (u) {
4090 if (left)
4091 Py_UNICODE_FILL(u->str, fill, left);
4092 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4093 if (right)
4094 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4095 }
4096
4097 return u;
4098}
4099
/* Append data[left:right] as a new unicode object to `list`.
   Relies on the enclosing function for a PyObject *str scratch variable,
   the `list` object and an onError label, which is jumped to on failure.
   Expands to several statements, so it must not be used as the
   unbraced body of an if/else/loop. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (!str)                                                           \
        goto onError;                                                   \
    if (PyList_Append(list, str)) {                                     \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
4110
/* Insert data[left:right] as a new unicode object at the front of
   `list`.  Relies on the enclosing function for a PyObject *str scratch
   variable, the `list` object and an onError label, which is jumped to
   on failure.  Expands to several statements, so it must not be used as
   the unbraced body of an if/else/loop. */
#define SPLIT_INSERT(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (!str)                                                           \
        goto onError;                                                   \
    if (PyList_Insert(list, 0, str)) {                                  \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
4121
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122static
4123PyObject *split_whitespace(PyUnicodeObject *self,
4124 PyObject *list,
4125 int maxcount)
4126{
4127 register int i;
4128 register int j;
4129 int len = self->length;
4130 PyObject *str;
4131
4132 for (i = j = 0; i < len; ) {
4133 /* find a token */
4134 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4135 i++;
4136 j = i;
4137 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4138 i++;
4139 if (j < i) {
4140 if (maxcount-- <= 0)
4141 break;
4142 SPLIT_APPEND(self->str, j, i);
4143 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4144 i++;
4145 j = i;
4146 }
4147 }
4148 if (j < len) {
4149 SPLIT_APPEND(self->str, j, len);
4150 }
4151 return list;
4152
4153 onError:
4154 Py_DECREF(list);
4155 return NULL;
4156}
4157
4158PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004159 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160{
4161 register int i;
4162 register int j;
4163 int len;
4164 PyObject *list;
4165 PyObject *str;
4166 Py_UNICODE *data;
4167
4168 string = PyUnicode_FromObject(string);
4169 if (string == NULL)
4170 return NULL;
4171 data = PyUnicode_AS_UNICODE(string);
4172 len = PyUnicode_GET_SIZE(string);
4173
Guido van Rossumd57fd912000-03-10 22:53:23 +00004174 list = PyList_New(0);
4175 if (!list)
4176 goto onError;
4177
4178 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004179 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004180
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181 /* Find a line and append it */
4182 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4183 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184
4185 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004186 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187 if (i < len) {
4188 if (data[i] == '\r' && i + 1 < len &&
4189 data[i+1] == '\n')
4190 i += 2;
4191 else
4192 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004193 if (keepends)
4194 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195 }
Guido van Rossum86662912000-04-11 15:38:46 +00004196 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 j = i;
4198 }
4199 if (j < len) {
4200 SPLIT_APPEND(data, j, len);
4201 }
4202
4203 Py_DECREF(string);
4204 return list;
4205
4206 onError:
4207 Py_DECREF(list);
4208 Py_DECREF(string);
4209 return NULL;
4210}
4211
Tim Petersced69f82003-09-16 20:30:58 +00004212static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213PyObject *split_char(PyUnicodeObject *self,
4214 PyObject *list,
4215 Py_UNICODE ch,
4216 int maxcount)
4217{
4218 register int i;
4219 register int j;
4220 int len = self->length;
4221 PyObject *str;
4222
4223 for (i = j = 0; i < len; ) {
4224 if (self->str[i] == ch) {
4225 if (maxcount-- <= 0)
4226 break;
4227 SPLIT_APPEND(self->str, j, i);
4228 i = j = i + 1;
4229 } else
4230 i++;
4231 }
4232 if (j <= len) {
4233 SPLIT_APPEND(self->str, j, len);
4234 }
4235 return list;
4236
4237 onError:
4238 Py_DECREF(list);
4239 return NULL;
4240}
4241
Tim Petersced69f82003-09-16 20:30:58 +00004242static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004243PyObject *split_substring(PyUnicodeObject *self,
4244 PyObject *list,
4245 PyUnicodeObject *substring,
4246 int maxcount)
4247{
4248 register int i;
4249 register int j;
4250 int len = self->length;
4251 int sublen = substring->length;
4252 PyObject *str;
4253
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004254 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255 if (Py_UNICODE_MATCH(self, i, substring)) {
4256 if (maxcount-- <= 0)
4257 break;
4258 SPLIT_APPEND(self->str, j, i);
4259 i = j = i + sublen;
4260 } else
4261 i++;
4262 }
4263 if (j <= len) {
4264 SPLIT_APPEND(self->str, j, len);
4265 }
4266 return list;
4267
4268 onError:
4269 Py_DECREF(list);
4270 return NULL;
4271}
4272
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004273static
4274PyObject *rsplit_whitespace(PyUnicodeObject *self,
4275 PyObject *list,
4276 int maxcount)
4277{
4278 register int i;
4279 register int j;
4280 int len = self->length;
4281 PyObject *str;
4282
4283 for (i = j = len - 1; i >= 0; ) {
4284 /* find a token */
4285 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4286 i--;
4287 j = i;
4288 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4289 i--;
4290 if (j > i) {
4291 if (maxcount-- <= 0)
4292 break;
4293 SPLIT_INSERT(self->str, i + 1, j + 1);
4294 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4295 i--;
4296 j = i;
4297 }
4298 }
4299 if (j >= 0) {
4300 SPLIT_INSERT(self->str, 0, j + 1);
4301 }
4302 return list;
4303
4304 onError:
4305 Py_DECREF(list);
4306 return NULL;
4307}
4308
4309static
4310PyObject *rsplit_char(PyUnicodeObject *self,
4311 PyObject *list,
4312 Py_UNICODE ch,
4313 int maxcount)
4314{
4315 register int i;
4316 register int j;
4317 int len = self->length;
4318 PyObject *str;
4319
4320 for (i = j = len - 1; i >= 0; ) {
4321 if (self->str[i] == ch) {
4322 if (maxcount-- <= 0)
4323 break;
4324 SPLIT_INSERT(self->str, i + 1, j + 1);
4325 j = i = i - 1;
4326 } else
4327 i--;
4328 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004329 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004330 SPLIT_INSERT(self->str, 0, j + 1);
4331 }
4332 return list;
4333
4334 onError:
4335 Py_DECREF(list);
4336 return NULL;
4337}
4338
4339static
4340PyObject *rsplit_substring(PyUnicodeObject *self,
4341 PyObject *list,
4342 PyUnicodeObject *substring,
4343 int maxcount)
4344{
4345 register int i;
4346 register int j;
4347 int len = self->length;
4348 int sublen = substring->length;
4349 PyObject *str;
4350
4351 for (i = len - sublen, j = len; i >= 0; ) {
4352 if (Py_UNICODE_MATCH(self, i, substring)) {
4353 if (maxcount-- <= 0)
4354 break;
4355 SPLIT_INSERT(self->str, i + sublen, j);
4356 j = i;
4357 i -= sublen;
4358 } else
4359 i--;
4360 }
4361 if (j >= 0) {
4362 SPLIT_INSERT(self->str, 0, j);
4363 }
4364 return list;
4365
4366 onError:
4367 Py_DECREF(list);
4368 return NULL;
4369}
4370
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004372#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373
4374static
4375PyObject *split(PyUnicodeObject *self,
4376 PyUnicodeObject *substring,
4377 int maxcount)
4378{
4379 PyObject *list;
4380
4381 if (maxcount < 0)
4382 maxcount = INT_MAX;
4383
4384 list = PyList_New(0);
4385 if (!list)
4386 return NULL;
4387
4388 if (substring == NULL)
4389 return split_whitespace(self,list,maxcount);
4390
4391 else if (substring->length == 1)
4392 return split_char(self,list,substring->str[0],maxcount);
4393
4394 else if (substring->length == 0) {
4395 Py_DECREF(list);
4396 PyErr_SetString(PyExc_ValueError, "empty separator");
4397 return NULL;
4398 }
4399 else
4400 return split_substring(self,list,substring,maxcount);
4401}
4402
Tim Petersced69f82003-09-16 20:30:58 +00004403static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004404PyObject *rsplit(PyUnicodeObject *self,
4405 PyUnicodeObject *substring,
4406 int maxcount)
4407{
4408 PyObject *list;
4409
4410 if (maxcount < 0)
4411 maxcount = INT_MAX;
4412
4413 list = PyList_New(0);
4414 if (!list)
4415 return NULL;
4416
4417 if (substring == NULL)
4418 return rsplit_whitespace(self,list,maxcount);
4419
4420 else if (substring->length == 1)
4421 return rsplit_char(self,list,substring->str[0],maxcount);
4422
4423 else if (substring->length == 0) {
4424 Py_DECREF(list);
4425 PyErr_SetString(PyExc_ValueError, "empty separator");
4426 return NULL;
4427 }
4428 else
4429 return rsplit_substring(self,list,substring,maxcount);
4430}
4431
4432static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433PyObject *replace(PyUnicodeObject *self,
4434 PyUnicodeObject *str1,
4435 PyUnicodeObject *str2,
4436 int maxcount)
4437{
4438 PyUnicodeObject *u;
4439
4440 if (maxcount < 0)
4441 maxcount = INT_MAX;
4442
4443 if (str1->length == 1 && str2->length == 1) {
4444 int i;
4445
4446 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004447 if (!findchar(self->str, self->length, str1->str[0]) &&
4448 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 /* nothing to replace, return original string */
4450 Py_INCREF(self);
4451 u = self;
4452 } else {
4453 Py_UNICODE u1 = str1->str[0];
4454 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004455
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004457 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 self->length
4459 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004460 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004461 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004462 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 for (i = 0; i < u->length; i++)
4464 if (u->str[i] == u1) {
4465 if (--maxcount < 0)
4466 break;
4467 u->str[i] = u2;
4468 }
4469 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471
4472 } else {
4473 int n, i;
4474 Py_UNICODE *p;
4475
4476 /* replace strings */
4477 n = count(self, 0, self->length, str1);
4478 if (n > maxcount)
4479 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004480 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004482 if (PyUnicode_CheckExact(self)) {
4483 Py_INCREF(self);
4484 u = self;
4485 }
4486 else {
4487 u = (PyUnicodeObject *)
4488 PyUnicode_FromUnicode(self->str, self->length);
4489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 } else {
4491 u = _PyUnicode_New(
4492 self->length + n * (str2->length - str1->length));
4493 if (u) {
4494 i = 0;
4495 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004496 if (str1->length > 0) {
4497 while (i <= self->length - str1->length)
4498 if (Py_UNICODE_MATCH(self, i, str1)) {
4499 /* replace string segment */
4500 Py_UNICODE_COPY(p, str2->str, str2->length);
4501 p += str2->length;
4502 i += str1->length;
4503 if (--n <= 0) {
4504 /* copy remaining part */
4505 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4506 break;
4507 }
4508 } else
4509 *p++ = self->str[i++];
4510 } else {
4511 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512 Py_UNICODE_COPY(p, str2->str, str2->length);
4513 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004514 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004517 }
4518 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520 }
4521 }
4522 }
Tim Petersced69f82003-09-16 20:30:58 +00004523
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 return (PyObject *) u;
4525}
4526
4527/* --- Unicode Object Methods --------------------------------------------- */
4528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004529PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530"S.title() -> unicode\n\
4531\n\
4532Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004533characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534
4535static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004536unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538 return fixup(self, fixtitle);
4539}
4540
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004541PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542"S.capitalize() -> unicode\n\
4543\n\
4544Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004545have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546
4547static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004548unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004549{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550 return fixup(self, fixcapitalize);
4551}
4552
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method implementation for S.capwords(): splits self on
   whitespace, runs every word through fixup() with fixcapitalize and
   joins the words again with PyUnicode_Join(NULL, ...).
   Returns the joined string, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                fixcapitalize);
        if (fixed == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
4590
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004591/* Argument converter. Coerces to a single unicode character */
4592
4593static int
4594convert_uc(PyObject *obj, void *addr)
4595{
4596 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4597 PyObject *uniobj;
4598 Py_UNICODE *unistr;
4599
4600 uniobj = PyUnicode_FromObject(obj);
4601 if (uniobj == NULL) {
4602 PyErr_SetString(PyExc_TypeError,
4603 "The fill character cannot be converted to Unicode");
4604 return 0;
4605 }
4606 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4607 PyErr_SetString(PyExc_TypeError,
4608 "The fill character must be exactly one character long");
4609 Py_DECREF(uniobj);
4610 return 0;
4611 }
4612 unistr = PyUnicode_AS_UNICODE(uniobj);
4613 *fillcharloc = unistr[0];
4614 Py_DECREF(uniobj);
4615 return 1;
4616}
4617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004618PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004619"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004621Return S centered in a Unicode string of length width. Padding is\n\
4622done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623
4624static PyObject *
4625unicode_center(PyUnicodeObject *self, PyObject *args)
4626{
4627 int marg, left;
4628 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004629 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004631 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632 return NULL;
4633
Tim Peters7a29bd52001-09-12 03:03:31 +00004634 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635 Py_INCREF(self);
4636 return (PyObject*) self;
4637 }
4638
4639 marg = width - self->length;
4640 left = marg / 2 + (marg & width & 1);
4641
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004642 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643}
4644
Marc-André Lemburge5034372000-08-08 08:04:29 +00004645#if 0
4646
4647/* This code should go into some future Unicode collation support
4648 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004649 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004650
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004651/* speedy UTF-16 code point order comparison */
4652/* gleaned from: */
4653/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4654
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004655static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004656{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004657 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004658 0, 0, 0, 0, 0, 0, 0, 0,
4659 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004660 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004661};
4662
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663static int
4664unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4665{
4666 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004667
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668 Py_UNICODE *s1 = str1->str;
4669 Py_UNICODE *s2 = str2->str;
4670
4671 len1 = str1->length;
4672 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004673
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004675 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004676
4677 c1 = *s1++;
4678 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004679
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004680 if (c1 > (1<<11) * 26)
4681 c1 += utf16Fixup[c1>>11];
4682 if (c2 > (1<<11) * 26)
4683 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004684 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004685
4686 if (c1 != c2)
4687 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004688
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004689 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690 }
4691
4692 return (len1 < len2) ? -1 : (len1 != len2);
4693}
4694
Marc-André Lemburge5034372000-08-08 08:04:29 +00004695#else
4696
4697static int
4698unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4699{
4700 register int len1, len2;
4701
4702 Py_UNICODE *s1 = str1->str;
4703 Py_UNICODE *s2 = str2->str;
4704
4705 len1 = str1->length;
4706 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004707
Marc-André Lemburge5034372000-08-08 08:04:29 +00004708 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004709 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004710
Fredrik Lundh45714e92001-06-26 16:39:36 +00004711 c1 = *s1++;
4712 c2 = *s2++;
4713
4714 if (c1 != c2)
4715 return (c1 < c2) ? -1 : 1;
4716
Marc-André Lemburge5034372000-08-08 08:04:29 +00004717 len1--; len2--;
4718 }
4719
4720 return (len1 < len2) ? -1 : (len1 != len2);
4721}
4722
4723#endif
4724
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725int PyUnicode_Compare(PyObject *left,
4726 PyObject *right)
4727{
4728 PyUnicodeObject *u = NULL, *v = NULL;
4729 int result;
4730
4731 /* Coerce the two arguments */
4732 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4733 if (u == NULL)
4734 goto onError;
4735 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4736 if (v == NULL)
4737 goto onError;
4738
Thomas Wouters7e474022000-07-16 12:04:32 +00004739 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 if (v == u) {
4741 Py_DECREF(u);
4742 Py_DECREF(v);
4743 return 0;
4744 }
4745
4746 result = unicode_compare(u, v);
4747
4748 Py_DECREF(u);
4749 Py_DECREF(v);
4750 return result;
4751
4752onError:
4753 Py_XDECREF(u);
4754 Py_XDECREF(v);
4755 return -1;
4756}
4757
Guido van Rossum403d68b2000-03-13 15:55:09 +00004758int PyUnicode_Contains(PyObject *container,
4759 PyObject *element)
4760{
4761 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004762 int result, size;
4763 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004764
4765 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004766 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004767 if (v == NULL) {
4768 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004769 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004770 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004771 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004772 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004773 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004774 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004775
Barry Warsaw817918c2002-08-06 16:58:21 +00004776 size = PyUnicode_GET_SIZE(v);
4777 rhs = PyUnicode_AS_UNICODE(v);
4778 lhs = PyUnicode_AS_UNICODE(u);
4779
Guido van Rossum403d68b2000-03-13 15:55:09 +00004780 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004781 if (size == 1) {
4782 end = lhs + PyUnicode_GET_SIZE(u);
4783 while (lhs < end) {
4784 if (*lhs++ == *rhs) {
4785 result = 1;
4786 break;
4787 }
4788 }
4789 }
4790 else {
4791 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4792 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004793 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004794 result = 1;
4795 break;
4796 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004797 }
4798 }
4799
4800 Py_DECREF(u);
4801 Py_DECREF(v);
4802 return result;
4803
4804onError:
4805 Py_XDECREF(u);
4806 Py_XDECREF(v);
4807 return -1;
4808}
4809
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810/* Concat to string or Unicode object giving a new Unicode object. */
4811
4812PyObject *PyUnicode_Concat(PyObject *left,
4813 PyObject *right)
4814{
4815 PyUnicodeObject *u = NULL, *v = NULL, *w;
4816
4817 /* Coerce the two arguments */
4818 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4819 if (u == NULL)
4820 goto onError;
4821 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4822 if (v == NULL)
4823 goto onError;
4824
4825 /* Shortcuts */
4826 if (v == unicode_empty) {
4827 Py_DECREF(v);
4828 return (PyObject *)u;
4829 }
4830 if (u == unicode_empty) {
4831 Py_DECREF(u);
4832 return (PyObject *)v;
4833 }
4834
4835 /* Concat the two Unicode strings */
4836 w = _PyUnicode_New(u->length + v->length);
4837 if (w == NULL)
4838 goto onError;
4839 Py_UNICODE_COPY(w->str, u->str, u->length);
4840 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4841
4842 Py_DECREF(u);
4843 Py_DECREF(v);
4844 return (PyObject *)w;
4845
4846onError:
4847 Py_XDECREF(u);
4848 Py_XDECREF(v);
4849 return NULL;
4850}
4851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004852PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853"S.count(sub[, start[, end]]) -> int\n\
4854\n\
4855Return the number of occurrences of substring sub in Unicode string\n\
4856S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004857interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858
4859static PyObject *
4860unicode_count(PyUnicodeObject *self, PyObject *args)
4861{
4862 PyUnicodeObject *substring;
4863 int start = 0;
4864 int end = INT_MAX;
4865 PyObject *result;
4866
Guido van Rossumb8872e62000-05-09 14:14:27 +00004867 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4868 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869 return NULL;
4870
4871 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4872 (PyObject *)substring);
4873 if (substring == NULL)
4874 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004875
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 if (start < 0)
4877 start += self->length;
4878 if (start < 0)
4879 start = 0;
4880 if (end > self->length)
4881 end = self->length;
4882 if (end < 0)
4883 end += self->length;
4884 if (end < 0)
4885 end = 0;
4886
4887 result = PyInt_FromLong((long) count(self, start, end, substring));
4888
4889 Py_DECREF(substring);
4890 return result;
4891}
4892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004893PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004894"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004896Encodes S using the codec registered for encoding. encoding defaults\n\
4897to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004898handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4900'xmlcharrefreplace' as well as any other name registered with\n\
4901codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902
4903static PyObject *
4904unicode_encode(PyUnicodeObject *self, PyObject *args)
4905{
4906 char *encoding = NULL;
4907 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004908 PyObject *v;
4909
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4911 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004912 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004913 if (v == NULL)
4914 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004915 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
4916 PyErr_Format(PyExc_TypeError,
4917 "encoder did not return a string/unicode object "
4918 "(type=%.400s)",
4919 v->ob_type->tp_name);
4920 Py_DECREF(v);
4921 return NULL;
4922 }
4923 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004924
4925 onError:
4926 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004927}
4928
4929PyDoc_STRVAR(decode__doc__,
4930"S.decode([encoding[,errors]]) -> string or unicode\n\
4931\n\
4932Decodes S using the codec registered for encoding. encoding defaults\n\
4933to the default encoding. errors may be given to set a different error\n\
4934handling scheme. Default is 'strict' meaning that encoding errors raise\n\
4935a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
4936as well as any other name registerd with codecs.register_error that is\n\
4937able to handle UnicodeDecodeErrors.");
4938
4939static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00004940unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004941{
4942 char *encoding = NULL;
4943 char *errors = NULL;
4944 PyObject *v;
4945
4946 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
4947 return NULL;
4948 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004949 if (v == NULL)
4950 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004951 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
4952 PyErr_Format(PyExc_TypeError,
4953 "decoder did not return a string/unicode object "
4954 "(type=%.400s)",
4955 v->ob_type->tp_name);
4956 Py_DECREF(v);
4957 return NULL;
4958 }
4959 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004960
4961 onError:
4962 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963}
4964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004965PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966"S.expandtabs([tabsize]) -> unicode\n\
4967\n\
4968Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004969If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970
4971static PyObject*
4972unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4973{
4974 Py_UNICODE *e;
4975 Py_UNICODE *p;
4976 Py_UNICODE *q;
4977 int i, j;
4978 PyUnicodeObject *u;
4979 int tabsize = 8;
4980
4981 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4982 return NULL;
4983
Thomas Wouters7e474022000-07-16 12:04:32 +00004984 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985 i = j = 0;
4986 e = self->str + self->length;
4987 for (p = self->str; p < e; p++)
4988 if (*p == '\t') {
4989 if (tabsize > 0)
4990 j += tabsize - (j % tabsize);
4991 }
4992 else {
4993 j++;
4994 if (*p == '\n' || *p == '\r') {
4995 i += j;
4996 j = 0;
4997 }
4998 }
4999
5000 /* Second pass: create output string and fill it */
5001 u = _PyUnicode_New(i + j);
5002 if (!u)
5003 return NULL;
5004
5005 j = 0;
5006 q = u->str;
5007
5008 for (p = self->str; p < e; p++)
5009 if (*p == '\t') {
5010 if (tabsize > 0) {
5011 i = tabsize - (j % tabsize);
5012 j += i;
5013 while (i--)
5014 *q++ = ' ';
5015 }
5016 }
5017 else {
5018 j++;
5019 *q++ = *p;
5020 if (*p == '\n' || *p == '\r')
5021 j = 0;
5022 }
5023
5024 return (PyObject*) u;
5025}
5026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005027PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028"S.find(sub [,start [,end]]) -> int\n\
5029\n\
5030Return the lowest index in S where substring sub is found,\n\
5031such that sub is contained within s[start,end]. Optional\n\
5032arguments start and end are interpreted as in slice notation.\n\
5033\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005034Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035
5036static PyObject *
5037unicode_find(PyUnicodeObject *self, PyObject *args)
5038{
5039 PyUnicodeObject *substring;
5040 int start = 0;
5041 int end = INT_MAX;
5042 PyObject *result;
5043
Guido van Rossumb8872e62000-05-09 14:14:27 +00005044 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5045 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 return NULL;
5047 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5048 (PyObject *)substring);
5049 if (substring == NULL)
5050 return NULL;
5051
5052 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5053
5054 Py_DECREF(substring);
5055 return result;
5056}
5057
5058static PyObject *
5059unicode_getitem(PyUnicodeObject *self, int index)
5060{
5061 if (index < 0 || index >= self->length) {
5062 PyErr_SetString(PyExc_IndexError, "string index out of range");
5063 return NULL;
5064 }
5065
5066 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5067}
5068
5069static long
5070unicode_hash(PyUnicodeObject *self)
5071{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005072 /* Since Unicode objects compare equal to their ASCII string
5073 counterparts, they should use the individual character values
5074 as basis for their hash value. This is needed to assure that
5075 strings and Unicode objects behave in the same way as
5076 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077
Fredrik Lundhdde61642000-07-10 18:27:47 +00005078 register int len;
5079 register Py_UNICODE *p;
5080 register long x;
5081
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082 if (self->hash != -1)
5083 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005084 len = PyUnicode_GET_SIZE(self);
5085 p = PyUnicode_AS_UNICODE(self);
5086 x = *p << 7;
5087 while (--len >= 0)
5088 x = (1000003*x) ^ *p++;
5089 x ^= PyUnicode_GET_SIZE(self);
5090 if (x == -1)
5091 x = -2;
5092 self->hash = x;
5093 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094}
5095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005096PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097"S.index(sub [,start [,end]]) -> int\n\
5098\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005099Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100
5101static PyObject *
5102unicode_index(PyUnicodeObject *self, PyObject *args)
5103{
5104 int result;
5105 PyUnicodeObject *substring;
5106 int start = 0;
5107 int end = INT_MAX;
5108
Guido van Rossumb8872e62000-05-09 14:14:27 +00005109 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5110 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005112
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5114 (PyObject *)substring);
5115 if (substring == NULL)
5116 return NULL;
5117
5118 result = findstring(self, substring, start, end, 1);
5119
5120 Py_DECREF(substring);
5121 if (result < 0) {
5122 PyErr_SetString(PyExc_ValueError, "substring not found");
5123 return NULL;
5124 }
5125 return PyInt_FromLong(result);
5126}
5127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005128PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005129"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005131Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005132at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133
5134static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005135unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136{
5137 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5138 register const Py_UNICODE *e;
5139 int cased;
5140
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 /* Shortcut for single character strings */
5142 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005143 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005145 /* Special case for empty strings */
5146 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005147 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005148
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 e = p + PyUnicode_GET_SIZE(self);
5150 cased = 0;
5151 for (; p < e; p++) {
5152 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005153
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005155 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 else if (!cased && Py_UNICODE_ISLOWER(ch))
5157 cased = 1;
5158 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005159 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160}
5161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005162PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005163"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005165Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005166at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167
5168static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005169unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
5171 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5172 register const Py_UNICODE *e;
5173 int cased;
5174
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 /* Shortcut for single character strings */
5176 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005177 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005179 /* Special case for empty strings */
5180 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005181 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005182
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 e = p + PyUnicode_GET_SIZE(self);
5184 cased = 0;
5185 for (; p < e; p++) {
5186 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005187
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005189 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 else if (!cased && Py_UNICODE_ISUPPER(ch))
5191 cased = 1;
5192 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005193 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194}
5195
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005196PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005197"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005199Return True if S is a titlecased string and there is at least one\n\
5200character in S, i.e. upper- and titlecase characters may only\n\
5201follow uncased characters and lowercase characters only cased ones.\n\
5202Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203
5204static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005205unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206{
5207 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5208 register const Py_UNICODE *e;
5209 int cased, previous_is_cased;
5210
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 /* Shortcut for single character strings */
5212 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005213 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5214 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005216 /* Special case for empty strings */
5217 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005218 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005219
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 e = p + PyUnicode_GET_SIZE(self);
5221 cased = 0;
5222 previous_is_cased = 0;
5223 for (; p < e; p++) {
5224 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005225
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5227 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005228 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 previous_is_cased = 1;
5230 cased = 1;
5231 }
5232 else if (Py_UNICODE_ISLOWER(ch)) {
5233 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005234 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 previous_is_cased = 1;
5236 cased = 1;
5237 }
5238 else
5239 previous_is_cased = 0;
5240 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005241 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242}
5243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005244PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005245"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005247Return True if all characters in S are whitespace\n\
5248and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249
5250static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005251unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252{
5253 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5254 register const Py_UNICODE *e;
5255
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256 /* Shortcut for single character strings */
5257 if (PyUnicode_GET_SIZE(self) == 1 &&
5258 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005259 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005261 /* Special case for empty strings */
5262 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005263 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005264
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 e = p + PyUnicode_GET_SIZE(self);
5266 for (; p < e; p++) {
5267 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005268 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005270 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271}
5272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005273PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005274"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005275\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005276Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005277and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005278
5279static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005280unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005281{
5282 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5283 register const Py_UNICODE *e;
5284
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005285 /* Shortcut for single character strings */
5286 if (PyUnicode_GET_SIZE(self) == 1 &&
5287 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005288 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005289
5290 /* Special case for empty strings */
5291 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005292 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005293
5294 e = p + PyUnicode_GET_SIZE(self);
5295 for (; p < e; p++) {
5296 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005297 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005298 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005299 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005300}
5301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005302PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005303"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005304\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005305Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005306and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005307
5308static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005309unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005310{
5311 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5312 register const Py_UNICODE *e;
5313
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005314 /* Shortcut for single character strings */
5315 if (PyUnicode_GET_SIZE(self) == 1 &&
5316 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005317 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005318
5319 /* Special case for empty strings */
5320 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005321 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005322
5323 e = p + PyUnicode_GET_SIZE(self);
5324 for (; p < e; p++) {
5325 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005326 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005327 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005328 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005329}
5330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005331PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005332"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005334Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005335False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336
5337static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005338unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339{
5340 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5341 register const Py_UNICODE *e;
5342
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 /* Shortcut for single character strings */
5344 if (PyUnicode_GET_SIZE(self) == 1 &&
5345 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005346 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005348 /* Special case for empty strings */
5349 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005350 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005351
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 e = p + PyUnicode_GET_SIZE(self);
5353 for (; p < e; p++) {
5354 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005355 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005357 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358}
5359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005360PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005361"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005363Return True if all characters in S are digits\n\
5364and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365
5366static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005367unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
5369 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5370 register const Py_UNICODE *e;
5371
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 /* Shortcut for single character strings */
5373 if (PyUnicode_GET_SIZE(self) == 1 &&
5374 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005375 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005377 /* Special case for empty strings */
5378 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005379 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005380
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 e = p + PyUnicode_GET_SIZE(self);
5382 for (; p < e; p++) {
5383 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005384 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005386 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387}
5388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005389PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005390"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005392Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005393False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394
5395static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005396unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397{
5398 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5399 register const Py_UNICODE *e;
5400
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 /* Shortcut for single character strings */
5402 if (PyUnicode_GET_SIZE(self) == 1 &&
5403 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005404 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005406 /* Special case for empty strings */
5407 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005408 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005409
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 e = p + PyUnicode_GET_SIZE(self);
5411 for (; p < e; p++) {
5412 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005413 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005415 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416}
5417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005418PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419"S.join(sequence) -> unicode\n\
5420\n\
5421Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005422sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423
5424static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005425unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005427 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428}
5429
5430static int
5431unicode_length(PyUnicodeObject *self)
5432{
5433 return self->length;
5434}
5435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005436PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005437"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438\n\
5439Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005440done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441
5442static PyObject *
5443unicode_ljust(PyUnicodeObject *self, PyObject *args)
5444{
5445 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005446 Py_UNICODE fillchar = ' ';
5447
5448 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 return NULL;
5450
Tim Peters7a29bd52001-09-12 03:03:31 +00005451 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 Py_INCREF(self);
5453 return (PyObject*) self;
5454 }
5455
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005456 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457}
5458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005459PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460"S.lower() -> unicode\n\
5461\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005462Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463
5464static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005465unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 return fixup(self, fixlower);
5468}
5469
/* Selector values for the strip helpers; they double as indices into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, one per selector value above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
5478
5479static const Py_UNICODE *
5480unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5481{
Tim Peters030a5ce2002-04-22 19:00:10 +00005482 size_t i;
5483 for (i = 0; i < n; ++i)
5484 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005485 return s+i;
5486 return NULL;
5487}
5488
5489/* externally visible for str.strip(unicode) */
5490PyObject *
5491_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5492{
5493 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5494 int len = PyUnicode_GET_SIZE(self);
5495 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5496 int seplen = PyUnicode_GET_SIZE(sepobj);
5497 int i, j;
5498
5499 i = 0;
5500 if (striptype != RIGHTSTRIP) {
5501 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5502 i++;
5503 }
5504 }
5505
5506 j = len;
5507 if (striptype != LEFTSTRIP) {
5508 do {
5509 j--;
5510 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5511 j++;
5512 }
5513
5514 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5515 Py_INCREF(self);
5516 return (PyObject*)self;
5517 }
5518 else
5519 return PyUnicode_FromUnicode(s+i, j-i);
5520}
5521
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522
5523static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005524do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005526 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5527 int len = PyUnicode_GET_SIZE(self), i, j;
5528
5529 i = 0;
5530 if (striptype != RIGHTSTRIP) {
5531 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5532 i++;
5533 }
5534 }
5535
5536 j = len;
5537 if (striptype != LEFTSTRIP) {
5538 do {
5539 j--;
5540 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5541 j++;
5542 }
5543
5544 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5545 Py_INCREF(self);
5546 return (PyObject*)self;
5547 }
5548 else
5549 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550}
5551
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005552
5553static PyObject *
5554do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5555{
5556 PyObject *sep = NULL;
5557
5558 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5559 return NULL;
5560
5561 if (sep != NULL && sep != Py_None) {
5562 if (PyUnicode_Check(sep))
5563 return _PyUnicode_XStrip(self, striptype, sep);
5564 else if (PyString_Check(sep)) {
5565 PyObject *res;
5566 sep = PyUnicode_FromObject(sep);
5567 if (sep==NULL)
5568 return NULL;
5569 res = _PyUnicode_XStrip(self, striptype, sep);
5570 Py_DECREF(sep);
5571 return res;
5572 }
5573 else {
5574 PyErr_Format(PyExc_TypeError,
5575 "%s arg must be None, unicode or str",
5576 STRIPNAME(striptype));
5577 return NULL;
5578 }
5579 }
5580
5581 return do_strip(self, striptype);
5582}
5583
5584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005585PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005586"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005587\n\
5588Return a copy of the string S with leading and trailing\n\
5589whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005590If chars is given and not None, remove characters in chars instead.\n\
5591If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005592
5593static PyObject *
5594unicode_strip(PyUnicodeObject *self, PyObject *args)
5595{
5596 if (PyTuple_GET_SIZE(args) == 0)
5597 return do_strip(self, BOTHSTRIP); /* Common case */
5598 else
5599 return do_argstrip(self, BOTHSTRIP, args);
5600}
5601
5602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005603PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005604"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005605\n\
5606Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005607If chars is given and not None, remove characters in chars instead.\n\
5608If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005609
5610static PyObject *
5611unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5612{
5613 if (PyTuple_GET_SIZE(args) == 0)
5614 return do_strip(self, LEFTSTRIP); /* Common case */
5615 else
5616 return do_argstrip(self, LEFTSTRIP, args);
5617}
5618
5619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005620PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005621"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005622\n\
5623Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005624If chars is given and not None, remove characters in chars instead.\n\
5625If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005626
5627static PyObject *
5628unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5629{
5630 if (PyTuple_GET_SIZE(args) == 0)
5631 return do_strip(self, RIGHTSTRIP); /* Common case */
5632 else
5633 return do_argstrip(self, RIGHTSTRIP, args);
5634}
5635
5636
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637static PyObject*
5638unicode_repeat(PyUnicodeObject *str, int len)
5639{
5640 PyUnicodeObject *u;
5641 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005642 int nchars;
5643 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644
5645 if (len < 0)
5646 len = 0;
5647
Tim Peters7a29bd52001-09-12 03:03:31 +00005648 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 /* no repeat, return original string */
5650 Py_INCREF(str);
5651 return (PyObject*) str;
5652 }
Tim Peters8f422462000-09-09 06:13:41 +00005653
5654 /* ensure # of chars needed doesn't overflow int and # of bytes
5655 * needed doesn't overflow size_t
5656 */
5657 nchars = len * str->length;
5658 if (len && nchars / len != str->length) {
5659 PyErr_SetString(PyExc_OverflowError,
5660 "repeated string is too long");
5661 return NULL;
5662 }
5663 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5664 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5665 PyErr_SetString(PyExc_OverflowError,
5666 "repeated string is too long");
5667 return NULL;
5668 }
5669 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 if (!u)
5671 return NULL;
5672
5673 p = u->str;
5674
5675 while (len-- > 0) {
5676 Py_UNICODE_COPY(p, str->str, str->length);
5677 p += str->length;
5678 }
5679
5680 return (PyObject*) u;
5681}
5682
5683PyObject *PyUnicode_Replace(PyObject *obj,
5684 PyObject *subobj,
5685 PyObject *replobj,
5686 int maxcount)
5687{
5688 PyObject *self;
5689 PyObject *str1;
5690 PyObject *str2;
5691 PyObject *result;
5692
5693 self = PyUnicode_FromObject(obj);
5694 if (self == NULL)
5695 return NULL;
5696 str1 = PyUnicode_FromObject(subobj);
5697 if (str1 == NULL) {
5698 Py_DECREF(self);
5699 return NULL;
5700 }
5701 str2 = PyUnicode_FromObject(replobj);
5702 if (str2 == NULL) {
5703 Py_DECREF(self);
5704 Py_DECREF(str1);
5705 return NULL;
5706 }
Tim Petersced69f82003-09-16 20:30:58 +00005707 result = replace((PyUnicodeObject *)self,
5708 (PyUnicodeObject *)str1,
5709 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 maxcount);
5711 Py_DECREF(self);
5712 Py_DECREF(str1);
5713 Py_DECREF(str2);
5714 return result;
5715}
5716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005717PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718"S.replace (old, new[, maxsplit]) -> unicode\n\
5719\n\
5720Return a copy of S with all occurrences of substring\n\
5721old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005722given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723
5724static PyObject*
5725unicode_replace(PyUnicodeObject *self, PyObject *args)
5726{
5727 PyUnicodeObject *str1;
5728 PyUnicodeObject *str2;
5729 int maxcount = -1;
5730 PyObject *result;
5731
5732 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5733 return NULL;
5734 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5735 if (str1 == NULL)
5736 return NULL;
5737 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005738 if (str2 == NULL) {
5739 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005741 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
5743 result = replace(self, str1, str2, maxcount);
5744
5745 Py_DECREF(str1);
5746 Py_DECREF(str2);
5747 return result;
5748}
5749
5750static
5751PyObject *unicode_repr(PyObject *unicode)
5752{
5753 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5754 PyUnicode_GET_SIZE(unicode),
5755 1);
5756}
5757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005758PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759"S.rfind(sub [,start [,end]]) -> int\n\
5760\n\
5761Return the highest index in S where substring sub is found,\n\
5762such that sub is contained within s[start,end]. Optional\n\
5763arguments start and end are interpreted as in slice notation.\n\
5764\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005765Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766
5767static PyObject *
5768unicode_rfind(PyUnicodeObject *self, PyObject *args)
5769{
5770 PyUnicodeObject *substring;
5771 int start = 0;
5772 int end = INT_MAX;
5773 PyObject *result;
5774
Guido van Rossumb8872e62000-05-09 14:14:27 +00005775 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5776 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 return NULL;
5778 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5779 (PyObject *)substring);
5780 if (substring == NULL)
5781 return NULL;
5782
5783 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5784
5785 Py_DECREF(substring);
5786 return result;
5787}
5788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005789PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790"S.rindex(sub [,start [,end]]) -> int\n\
5791\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005792Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793
5794static PyObject *
5795unicode_rindex(PyUnicodeObject *self, PyObject *args)
5796{
5797 int result;
5798 PyUnicodeObject *substring;
5799 int start = 0;
5800 int end = INT_MAX;
5801
Guido van Rossumb8872e62000-05-09 14:14:27 +00005802 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5803 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 return NULL;
5805 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5806 (PyObject *)substring);
5807 if (substring == NULL)
5808 return NULL;
5809
5810 result = findstring(self, substring, start, end, -1);
5811
5812 Py_DECREF(substring);
5813 if (result < 0) {
5814 PyErr_SetString(PyExc_ValueError, "substring not found");
5815 return NULL;
5816 }
5817 return PyInt_FromLong(result);
5818}
5819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005820PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005821"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822\n\
5823Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005824done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825
5826static PyObject *
5827unicode_rjust(PyUnicodeObject *self, PyObject *args)
5828{
5829 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005830 Py_UNICODE fillchar = ' ';
5831
5832 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833 return NULL;
5834
Tim Peters7a29bd52001-09-12 03:03:31 +00005835 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 Py_INCREF(self);
5837 return (PyObject*) self;
5838 }
5839
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005840 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841}
5842
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843static PyObject*
5844unicode_slice(PyUnicodeObject *self, int start, int end)
5845{
5846 /* standard clamping */
5847 if (start < 0)
5848 start = 0;
5849 if (end < 0)
5850 end = 0;
5851 if (end > self->length)
5852 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005853 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 /* full slice, return original string */
5855 Py_INCREF(self);
5856 return (PyObject*) self;
5857 }
5858 if (start > end)
5859 start = end;
5860 /* copy slice */
5861 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5862 end - start);
5863}
5864
5865PyObject *PyUnicode_Split(PyObject *s,
5866 PyObject *sep,
5867 int maxsplit)
5868{
5869 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005870
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 s = PyUnicode_FromObject(s);
5872 if (s == NULL)
5873 return NULL;
5874 if (sep != NULL) {
5875 sep = PyUnicode_FromObject(sep);
5876 if (sep == NULL) {
5877 Py_DECREF(s);
5878 return NULL;
5879 }
5880 }
5881
5882 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5883
5884 Py_DECREF(s);
5885 Py_XDECREF(sep);
5886 return result;
5887}
5888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005889PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890"S.split([sep [,maxsplit]]) -> list of strings\n\
5891\n\
5892Return a list of the words in S, using sep as the\n\
5893delimiter string. If maxsplit is given, at most maxsplit\n\
5894splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005895is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896
5897static PyObject*
5898unicode_split(PyUnicodeObject *self, PyObject *args)
5899{
5900 PyObject *substring = Py_None;
5901 int maxcount = -1;
5902
5903 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5904 return NULL;
5905
5906 if (substring == Py_None)
5907 return split(self, NULL, maxcount);
5908 else if (PyUnicode_Check(substring))
5909 return split(self, (PyUnicodeObject *)substring, maxcount);
5910 else
5911 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5912}
5913
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005914PyObject *PyUnicode_RSplit(PyObject *s,
5915 PyObject *sep,
5916 int maxsplit)
5917{
5918 PyObject *result;
5919
5920 s = PyUnicode_FromObject(s);
5921 if (s == NULL)
5922 return NULL;
5923 if (sep != NULL) {
5924 sep = PyUnicode_FromObject(sep);
5925 if (sep == NULL) {
5926 Py_DECREF(s);
5927 return NULL;
5928 }
5929 }
5930
5931 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5932
5933 Py_DECREF(s);
5934 Py_XDECREF(sep);
5935 return result;
5936}
5937
5938PyDoc_STRVAR(rsplit__doc__,
5939"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
5940\n\
5941Return a list of the words in S, using sep as the\n\
5942delimiter string, starting at the end of the string and\n\
5943working to the front. If maxsplit is given, at most maxsplit\n\
5944splits are done. If sep is not specified, any whitespace string\n\
5945is a separator.");
5946
5947static PyObject*
5948unicode_rsplit(PyUnicodeObject *self, PyObject *args)
5949{
5950 PyObject *substring = Py_None;
5951 int maxcount = -1;
5952
5953 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
5954 return NULL;
5955
5956 if (substring == Py_None)
5957 return rsplit(self, NULL, maxcount);
5958 else if (PyUnicode_Check(substring))
5959 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
5960 else
5961 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
5962}
5963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005964PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005965"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966\n\
5967Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005968Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005969is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970
5971static PyObject*
5972unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5973{
Guido van Rossum86662912000-04-11 15:38:46 +00005974 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975
Guido van Rossum86662912000-04-11 15:38:46 +00005976 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 return NULL;
5978
Guido van Rossum86662912000-04-11 15:38:46 +00005979 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980}
5981
5982static
5983PyObject *unicode_str(PyUnicodeObject *self)
5984{
Fred Drakee4315f52000-05-09 19:53:39 +00005985 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986}
5987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005988PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989"S.swapcase() -> unicode\n\
5990\n\
5991Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005992and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993
5994static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005995unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 return fixup(self, fixswapcase);
5998}
5999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006000PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001"S.translate(table) -> unicode\n\
6002\n\
6003Return a copy of the string S, where all characters have been mapped\n\
6004through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006005Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6006Unmapped characters are left untouched. Characters mapped to None\n\
6007are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008
6009static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006010unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011{
Tim Petersced69f82003-09-16 20:30:58 +00006012 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006014 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 "ignore");
6016}
6017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006018PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019"S.upper() -> unicode\n\
6020\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006021Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022
6023static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006024unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 return fixup(self, fixupper);
6027}
6028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006029PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030"S.zfill(width) -> unicode\n\
6031\n\
6032Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006033of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034
6035static PyObject *
6036unicode_zfill(PyUnicodeObject *self, PyObject *args)
6037{
6038 int fill;
6039 PyUnicodeObject *u;
6040
6041 int width;
6042 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6043 return NULL;
6044
6045 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006046 if (PyUnicode_CheckExact(self)) {
6047 Py_INCREF(self);
6048 return (PyObject*) self;
6049 }
6050 else
6051 return PyUnicode_FromUnicode(
6052 PyUnicode_AS_UNICODE(self),
6053 PyUnicode_GET_SIZE(self)
6054 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 }
6056
6057 fill = width - self->length;
6058
6059 u = pad(self, fill, 0, '0');
6060
Walter Dörwald068325e2002-04-15 13:36:47 +00006061 if (u == NULL)
6062 return NULL;
6063
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 if (u->str[fill] == '+' || u->str[fill] == '-') {
6065 /* move sign to beginning of string */
6066 u->str[0] = u->str[fill];
6067 u->str[fill] = '0';
6068 }
6069
6070 return (PyObject*) u;
6071}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072
#if 0
/* Compiled out: returns the value of unicode_freelist_size as a Python
   int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006081PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006082"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006084Return True if S starts with the specified prefix, False otherwise.\n\
6085With optional start, test S beginning at that position.\n\
6086With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087
6088static PyObject *
6089unicode_startswith(PyUnicodeObject *self,
6090 PyObject *args)
6091{
6092 PyUnicodeObject *substring;
6093 int start = 0;
6094 int end = INT_MAX;
6095 PyObject *result;
6096
Guido van Rossumb8872e62000-05-09 14:14:27 +00006097 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6098 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 return NULL;
6100 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6101 (PyObject *)substring);
6102 if (substring == NULL)
6103 return NULL;
6104
Guido van Rossum77f6a652002-04-03 22:41:51 +00006105 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106
6107 Py_DECREF(substring);
6108 return result;
6109}
6110
6111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006112PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006113"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006115Return True if S ends with the specified suffix, False otherwise.\n\
6116With optional start, test S beginning at that position.\n\
6117With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118
6119static PyObject *
6120unicode_endswith(PyUnicodeObject *self,
6121 PyObject *args)
6122{
6123 PyUnicodeObject *substring;
6124 int start = 0;
6125 int end = INT_MAX;
6126 PyObject *result;
6127
Guido van Rossumb8872e62000-05-09 14:14:27 +00006128 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6129 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 return NULL;
6131 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6132 (PyObject *)substring);
6133 if (substring == NULL)
6134 return NULL;
6135
Guido van Rossum77f6a652002-04-03 22:41:51 +00006136 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137
6138 Py_DECREF(substring);
6139 return result;
6140}
6141
6142
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006143
6144static PyObject *
6145unicode_getnewargs(PyUnicodeObject *v)
6146{
6147 return Py_BuildValue("(u#)", v->str, v->length);
6148}
6149
6150
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151static PyMethodDef unicode_methods[] = {
6152
6153 /* Order is according to common usage: often used methods should
6154 appear first, since lookup is done sequentially. */
6155
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006156 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6157 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6158 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006159 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006160 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6161 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6162 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6163 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6164 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6165 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6166 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6167 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6168 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6169 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006170 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006171 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006172/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6173 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6174 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6175 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006176 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006177 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006178 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006179 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6180 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6181 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6182 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6183 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6184 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6185 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6186 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6187 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6188 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6189 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6190 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6191 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6192 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006193 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006194#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006195 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196#endif
6197
6198#if 0
6199 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006200 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201#endif
6202
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006203 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 {NULL, NULL}
6205};
6206
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006207static PyObject *
6208unicode_mod(PyObject *v, PyObject *w)
6209{
6210 if (!PyUnicode_Check(v)) {
6211 Py_INCREF(Py_NotImplemented);
6212 return Py_NotImplemented;
6213 }
6214 return PyUnicode_Format(v, w);
6215}
6216
6217static PyNumberMethods unicode_as_number = {
6218 0, /*nb_add*/
6219 0, /*nb_subtract*/
6220 0, /*nb_multiply*/
6221 0, /*nb_divide*/
6222 unicode_mod, /*nb_remainder*/
6223};
6224
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225static PySequenceMethods unicode_as_sequence = {
6226 (inquiry) unicode_length, /* sq_length */
6227 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6228 (intargfunc) unicode_repeat, /* sq_repeat */
6229 (intargfunc) unicode_getitem, /* sq_item */
6230 (intintargfunc) unicode_slice, /* sq_slice */
6231 0, /* sq_ass_item */
6232 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006233 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234};
6235
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006236static PyObject*
6237unicode_subscript(PyUnicodeObject* self, PyObject* item)
6238{
6239 if (PyInt_Check(item)) {
6240 long i = PyInt_AS_LONG(item);
6241 if (i < 0)
6242 i += PyString_GET_SIZE(self);
6243 return unicode_getitem(self, i);
6244 } else if (PyLong_Check(item)) {
6245 long i = PyLong_AsLong(item);
6246 if (i == -1 && PyErr_Occurred())
6247 return NULL;
6248 if (i < 0)
6249 i += PyString_GET_SIZE(self);
6250 return unicode_getitem(self, i);
6251 } else if (PySlice_Check(item)) {
6252 int start, stop, step, slicelength, cur, i;
6253 Py_UNICODE* source_buf;
6254 Py_UNICODE* result_buf;
6255 PyObject* result;
6256
6257 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6258 &start, &stop, &step, &slicelength) < 0) {
6259 return NULL;
6260 }
6261
6262 if (slicelength <= 0) {
6263 return PyUnicode_FromUnicode(NULL, 0);
6264 } else {
6265 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6266 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6267
6268 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6269 result_buf[i] = source_buf[cur];
6270 }
Tim Petersced69f82003-09-16 20:30:58 +00006271
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006272 result = PyUnicode_FromUnicode(result_buf, slicelength);
6273 PyMem_FREE(result_buf);
6274 return result;
6275 }
6276 } else {
6277 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6278 return NULL;
6279 }
6280}
6281
6282static PyMappingMethods unicode_as_mapping = {
6283 (inquiry)unicode_length, /* mp_length */
6284 (binaryfunc)unicode_subscript, /* mp_subscript */
6285 (objobjargproc)0, /* mp_ass_subscript */
6286};
6287
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288static int
6289unicode_buffer_getreadbuf(PyUnicodeObject *self,
6290 int index,
6291 const void **ptr)
6292{
6293 if (index != 0) {
6294 PyErr_SetString(PyExc_SystemError,
6295 "accessing non-existent unicode segment");
6296 return -1;
6297 }
6298 *ptr = (void *) self->str;
6299 return PyUnicode_GET_DATA_SIZE(self);
6300}
6301
6302static int
6303unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6304 const void **ptr)
6305{
6306 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006307 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308 return -1;
6309}
6310
6311static int
6312unicode_buffer_getsegcount(PyUnicodeObject *self,
6313 int *lenp)
6314{
6315 if (lenp)
6316 *lenp = PyUnicode_GET_DATA_SIZE(self);
6317 return 1;
6318}
6319
6320static int
6321unicode_buffer_getcharbuf(PyUnicodeObject *self,
6322 int index,
6323 const void **ptr)
6324{
6325 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006326
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 if (index != 0) {
6328 PyErr_SetString(PyExc_SystemError,
6329 "accessing non-existent unicode segment");
6330 return -1;
6331 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006332 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333 if (str == NULL)
6334 return -1;
6335 *ptr = (void *) PyString_AS_STRING(str);
6336 return PyString_GET_SIZE(str);
6337}
6338
6339/* Helpers for PyUnicode_Format() */
6340
6341static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006342getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343{
6344 int argidx = *p_argidx;
6345 if (argidx < arglen) {
6346 (*p_argidx)++;
6347 if (arglen < 0)
6348 return args;
6349 else
6350 return PyTuple_GetItem(args, argidx);
6351 }
6352 PyErr_SetString(PyExc_TypeError,
6353 "not enough arguments for format string");
6354 return NULL;
6355}
6356
/* Bit flags, one distinct bit each, stored in the `flags` argument of
   the format helpers. */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
6362
6363static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365{
6366 register int i;
6367 int len;
6368 va_list va;
6369 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371
6372 /* First, format the string as char array, then expand to Py_UNICODE
6373 array. */
6374 charbuffer = (char *)buffer;
6375 len = vsprintf(charbuffer, format, va);
6376 for (i = len - 1; i >= 0; i--)
6377 buffer[i] = (Py_UNICODE) charbuffer[i];
6378
6379 va_end(va);
6380 return len;
6381}
6382
Guido van Rossum078151d2002-08-11 04:24:12 +00006383/* XXX To save some code duplication, formatfloat/long/int could have been
6384 shared with stringobject.c, converting from 8-bit to Unicode after the
6385 formatting is done. */
6386
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387static int
6388formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006389 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 int flags,
6391 int prec,
6392 int type,
6393 PyObject *v)
6394{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006395 /* fmt = '%#.' + `prec` + `type`
6396 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 char fmt[20];
6398 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006399
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 x = PyFloat_AsDouble(v);
6401 if (x == -1.0 && PyErr_Occurred())
6402 return -1;
6403 if (prec < 0)
6404 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6406 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006407 /* Worst case length calc to ensure no buffer overrun:
6408
6409 'g' formats:
6410 fmt = %#.<prec>g
6411 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6412 for any double rep.)
6413 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6414
6415 'f' formats:
6416 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6417 len = 1 + 50 + 1 + prec = 52 + prec
6418
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006419 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006420 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006421
6422 */
6423 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6424 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006425 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006426 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006427 return -1;
6428 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006429 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6430 (flags&F_ALT) ? "#" : "",
6431 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 return usprintf(buf, fmt, x);
6433}
6434
Tim Peters38fd5b62000-09-21 05:43:11 +00006435static PyObject*
6436formatlong(PyObject *val, int flags, int prec, int type)
6437{
6438 char *buf;
6439 int i, len;
6440 PyObject *str; /* temporary string object. */
6441 PyUnicodeObject *result;
6442
6443 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6444 if (!str)
6445 return NULL;
6446 result = _PyUnicode_New(len);
6447 for (i = 0; i < len; i++)
6448 result->str[i] = buf[i];
6449 result->str[len] = 0;
6450 Py_DECREF(str);
6451 return (PyObject*)result;
6452}
6453
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454static int
6455formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006456 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 int flags,
6458 int prec,
6459 int type,
6460 PyObject *v)
6461{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006462 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006463 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6464 * + 1 + 1
6465 * = 24
6466 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006467 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006468 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 long x;
6470
6471 x = PyInt_AsLong(v);
6472 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006473 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006474 if (x < 0 && type == 'u') {
6475 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006476 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006477 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6478 sign = "-";
6479 else
6480 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006482 prec = 1;
6483
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006484 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6485 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006486 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006487 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006488 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006489 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006490 return -1;
6491 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006492
6493 if ((flags & F_ALT) &&
6494 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006495 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006496 * of issues that cause pain:
6497 * - when 0 is being converted, the C standard leaves off
6498 * the '0x' or '0X', which is inconsistent with other
6499 * %#x/%#X conversions and inconsistent with Python's
6500 * hex() function
6501 * - there are platforms that violate the standard and
6502 * convert 0 with the '0x' or '0X'
6503 * (Metrowerks, Compaq Tru64)
6504 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006505 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006506 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006507 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006508 * We can achieve the desired consistency by inserting our
6509 * own '0x' or '0X' prefix, and substituting %x/%X in place
6510 * of %#x/%#X.
6511 *
6512 * Note that this is the same approach as used in
6513 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006514 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006515 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6516 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006517 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006518 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006519 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6520 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006521 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006522 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006523 if (sign[0])
6524 return usprintf(buf, fmt, -x);
6525 else
6526 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527}
6528
6529static int
6530formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006531 size_t buflen,
6532 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006534 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006535 if (PyUnicode_Check(v)) {
6536 if (PyUnicode_GET_SIZE(v) != 1)
6537 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006539 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006541 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006542 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006543 goto onError;
6544 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6545 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546
6547 else {
6548 /* Integer input truncated to a character */
6549 long x;
6550 x = PyInt_AsLong(v);
6551 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006552 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006553#ifdef Py_UNICODE_WIDE
6554 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006555 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006556 "%c arg not in range(0x110000) "
6557 "(wide Python build)");
6558 return -1;
6559 }
6560#else
6561 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006562 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006563 "%c arg not in range(0x10000) "
6564 "(narrow Python build)");
6565 return -1;
6566 }
6567#endif
6568 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 }
6570 buf[1] = '\0';
6571 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006572
6573 onError:
6574 PyErr_SetString(PyExc_TypeError,
6575 "%c requires int or char");
6576 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577}
6578
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  The float and
   integer formatting routines check the requested precision against
   the buffer length before writing, but a better solution may be to
   malloc a buffer of appropriate size for each format.  For now, the
   current solution is sufficient.

   The expansion is fully parenthesized so that it behaves as a single
   operand wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)120)
6588
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589PyObject *PyUnicode_Format(PyObject *format,
6590 PyObject *args)
6591{
6592 Py_UNICODE *fmt, *res;
6593 int fmtcnt, rescnt, reslen, arglen, argidx;
6594 int args_owned = 0;
6595 PyUnicodeObject *result = NULL;
6596 PyObject *dict = NULL;
6597 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006598
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 if (format == NULL || args == NULL) {
6600 PyErr_BadInternalCall();
6601 return NULL;
6602 }
6603 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006604 if (uformat == NULL)
6605 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 fmt = PyUnicode_AS_UNICODE(uformat);
6607 fmtcnt = PyUnicode_GET_SIZE(uformat);
6608
6609 reslen = rescnt = fmtcnt + 100;
6610 result = _PyUnicode_New(reslen);
6611 if (result == NULL)
6612 goto onError;
6613 res = PyUnicode_AS_UNICODE(result);
6614
6615 if (PyTuple_Check(args)) {
6616 arglen = PyTuple_Size(args);
6617 argidx = 0;
6618 }
6619 else {
6620 arglen = -1;
6621 argidx = -2;
6622 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006623 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6624 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 dict = args;
6626
6627 while (--fmtcnt >= 0) {
6628 if (*fmt != '%') {
6629 if (--rescnt < 0) {
6630 rescnt = fmtcnt + 100;
6631 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006632 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 return NULL;
6634 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6635 --rescnt;
6636 }
6637 *res++ = *fmt++;
6638 }
6639 else {
6640 /* Got a format specifier */
6641 int flags = 0;
6642 int width = -1;
6643 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 Py_UNICODE c = '\0';
6645 Py_UNICODE fill;
6646 PyObject *v = NULL;
6647 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006648 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 Py_UNICODE sign;
6650 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006651 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652
6653 fmt++;
6654 if (*fmt == '(') {
6655 Py_UNICODE *keystart;
6656 int keylen;
6657 PyObject *key;
6658 int pcount = 1;
6659
6660 if (dict == NULL) {
6661 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006662 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 goto onError;
6664 }
6665 ++fmt;
6666 --fmtcnt;
6667 keystart = fmt;
6668 /* Skip over balanced parentheses */
6669 while (pcount > 0 && --fmtcnt >= 0) {
6670 if (*fmt == ')')
6671 --pcount;
6672 else if (*fmt == '(')
6673 ++pcount;
6674 fmt++;
6675 }
6676 keylen = fmt - keystart - 1;
6677 if (fmtcnt < 0 || pcount > 0) {
6678 PyErr_SetString(PyExc_ValueError,
6679 "incomplete format key");
6680 goto onError;
6681 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006682#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006683 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 then looked up since Python uses strings to hold
6685 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006686 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 key = PyUnicode_EncodeUTF8(keystart,
6688 keylen,
6689 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006690#else
6691 key = PyUnicode_FromUnicode(keystart, keylen);
6692#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693 if (key == NULL)
6694 goto onError;
6695 if (args_owned) {
6696 Py_DECREF(args);
6697 args_owned = 0;
6698 }
6699 args = PyObject_GetItem(dict, key);
6700 Py_DECREF(key);
6701 if (args == NULL) {
6702 goto onError;
6703 }
6704 args_owned = 1;
6705 arglen = -1;
6706 argidx = -2;
6707 }
6708 while (--fmtcnt >= 0) {
6709 switch (c = *fmt++) {
6710 case '-': flags |= F_LJUST; continue;
6711 case '+': flags |= F_SIGN; continue;
6712 case ' ': flags |= F_BLANK; continue;
6713 case '#': flags |= F_ALT; continue;
6714 case '0': flags |= F_ZERO; continue;
6715 }
6716 break;
6717 }
6718 if (c == '*') {
6719 v = getnextarg(args, arglen, &argidx);
6720 if (v == NULL)
6721 goto onError;
6722 if (!PyInt_Check(v)) {
6723 PyErr_SetString(PyExc_TypeError,
6724 "* wants int");
6725 goto onError;
6726 }
6727 width = PyInt_AsLong(v);
6728 if (width < 0) {
6729 flags |= F_LJUST;
6730 width = -width;
6731 }
6732 if (--fmtcnt >= 0)
6733 c = *fmt++;
6734 }
6735 else if (c >= '0' && c <= '9') {
6736 width = c - '0';
6737 while (--fmtcnt >= 0) {
6738 c = *fmt++;
6739 if (c < '0' || c > '9')
6740 break;
6741 if ((width*10) / 10 != width) {
6742 PyErr_SetString(PyExc_ValueError,
6743 "width too big");
6744 goto onError;
6745 }
6746 width = width*10 + (c - '0');
6747 }
6748 }
6749 if (c == '.') {
6750 prec = 0;
6751 if (--fmtcnt >= 0)
6752 c = *fmt++;
6753 if (c == '*') {
6754 v = getnextarg(args, arglen, &argidx);
6755 if (v == NULL)
6756 goto onError;
6757 if (!PyInt_Check(v)) {
6758 PyErr_SetString(PyExc_TypeError,
6759 "* wants int");
6760 goto onError;
6761 }
6762 prec = PyInt_AsLong(v);
6763 if (prec < 0)
6764 prec = 0;
6765 if (--fmtcnt >= 0)
6766 c = *fmt++;
6767 }
6768 else if (c >= '0' && c <= '9') {
6769 prec = c - '0';
6770 while (--fmtcnt >= 0) {
6771 c = Py_CHARMASK(*fmt++);
6772 if (c < '0' || c > '9')
6773 break;
6774 if ((prec*10) / 10 != prec) {
6775 PyErr_SetString(PyExc_ValueError,
6776 "prec too big");
6777 goto onError;
6778 }
6779 prec = prec*10 + (c - '0');
6780 }
6781 }
6782 } /* prec */
6783 if (fmtcnt >= 0) {
6784 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 if (--fmtcnt >= 0)
6786 c = *fmt++;
6787 }
6788 }
6789 if (fmtcnt < 0) {
6790 PyErr_SetString(PyExc_ValueError,
6791 "incomplete format");
6792 goto onError;
6793 }
6794 if (c != '%') {
6795 v = getnextarg(args, arglen, &argidx);
6796 if (v == NULL)
6797 goto onError;
6798 }
6799 sign = 0;
6800 fill = ' ';
6801 switch (c) {
6802
6803 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006804 pbuf = formatbuf;
6805 /* presume that buffer length is at least 1 */
6806 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 len = 1;
6808 break;
6809
6810 case 's':
6811 case 'r':
6812 if (PyUnicode_Check(v) && c == 's') {
6813 temp = v;
6814 Py_INCREF(temp);
6815 }
6816 else {
6817 PyObject *unicode;
6818 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006819 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 else
6821 temp = PyObject_Repr(v);
6822 if (temp == NULL)
6823 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006824 if (PyUnicode_Check(temp))
6825 /* nothing to do */;
6826 else if (PyString_Check(temp)) {
6827 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00006828 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006830 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 "strict");
6832 Py_DECREF(temp);
6833 temp = unicode;
6834 if (temp == NULL)
6835 goto onError;
6836 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006837 else {
6838 Py_DECREF(temp);
6839 PyErr_SetString(PyExc_TypeError,
6840 "%s argument has non-string str()");
6841 goto onError;
6842 }
6843 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006844 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 len = PyUnicode_GET_SIZE(temp);
6846 if (prec >= 0 && len > prec)
6847 len = prec;
6848 break;
6849
6850 case 'i':
6851 case 'd':
6852 case 'u':
6853 case 'o':
6854 case 'x':
6855 case 'X':
6856 if (c == 'i')
6857 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006858 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006859 temp = formatlong(v, flags, prec, c);
6860 if (!temp)
6861 goto onError;
6862 pbuf = PyUnicode_AS_UNICODE(temp);
6863 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00006864 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006866 else {
6867 pbuf = formatbuf;
6868 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6869 flags, prec, c, v);
6870 if (len < 0)
6871 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006872 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006873 }
6874 if (flags & F_ZERO)
6875 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 break;
6877
6878 case 'e':
6879 case 'E':
6880 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006881 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 case 'g':
6883 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006884 if (c == 'F')
6885 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006886 pbuf = formatbuf;
6887 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6888 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 if (len < 0)
6890 goto onError;
6891 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006892 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 fill = '0';
6894 break;
6895
6896 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006897 pbuf = formatbuf;
6898 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899 if (len < 0)
6900 goto onError;
6901 break;
6902
6903 default:
6904 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006905 "unsupported format character '%c' (0x%x) "
6906 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006907 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006908 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006909 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 goto onError;
6911 }
6912 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006913 if (*pbuf == '-' || *pbuf == '+') {
6914 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 len--;
6916 }
6917 else if (flags & F_SIGN)
6918 sign = '+';
6919 else if (flags & F_BLANK)
6920 sign = ' ';
6921 else
6922 sign = 0;
6923 }
6924 if (width < len)
6925 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006926 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 reslen -= rescnt;
6928 rescnt = width + fmtcnt + 100;
6929 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006930 if (reslen < 0) {
6931 Py_DECREF(result);
6932 return PyErr_NoMemory();
6933 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006934 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 return NULL;
6936 res = PyUnicode_AS_UNICODE(result)
6937 + reslen - rescnt;
6938 }
6939 if (sign) {
6940 if (fill != ' ')
6941 *res++ = sign;
6942 rescnt--;
6943 if (width > len)
6944 width--;
6945 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006946 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6947 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006948 assert(pbuf[1] == c);
6949 if (fill != ' ') {
6950 *res++ = *pbuf++;
6951 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006952 }
Tim Petersfff53252001-04-12 18:38:48 +00006953 rescnt -= 2;
6954 width -= 2;
6955 if (width < 0)
6956 width = 0;
6957 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006958 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 if (width > len && !(flags & F_LJUST)) {
6960 do {
6961 --rescnt;
6962 *res++ = fill;
6963 } while (--width > len);
6964 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006965 if (fill == ' ') {
6966 if (sign)
6967 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006968 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006969 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006970 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006971 *res++ = *pbuf++;
6972 *res++ = *pbuf++;
6973 }
6974 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006975 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976 res += len;
6977 rescnt -= len;
6978 while (--width >= len) {
6979 --rescnt;
6980 *res++ = ' ';
6981 }
6982 if (dict && (argidx < arglen) && c != '%') {
6983 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006984 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 goto onError;
6986 }
6987 Py_XDECREF(temp);
6988 } /* '%' */
6989 } /* until end */
6990 if (argidx < arglen && !dict) {
6991 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006992 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 goto onError;
6994 }
6995
6996 if (args_owned) {
6997 Py_DECREF(args);
6998 }
6999 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007000 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007001 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 return (PyObject *)result;
7003
7004 onError:
7005 Py_XDECREF(result);
7006 Py_DECREF(uformat);
7007 if (args_owned) {
7008 Py_DECREF(args);
7009 }
7010 return NULL;
7011}
7012
7013static PyBufferProcs unicode_as_buffer = {
7014 (getreadbufferproc) unicode_buffer_getreadbuf,
7015 (getwritebufferproc) unicode_buffer_getwritebuf,
7016 (getsegcountproc) unicode_buffer_getsegcount,
7017 (getcharbufferproc) unicode_buffer_getcharbuf,
7018};
7019
Jeremy Hylton938ace62002-07-17 16:30:39 +00007020static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007021unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7022
Tim Peters6d6c1a32001-08-02 04:15:00 +00007023static PyObject *
7024unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7025{
7026 PyObject *x = NULL;
7027 static char *kwlist[] = {"string", "encoding", "errors", 0};
7028 char *encoding = NULL;
7029 char *errors = NULL;
7030
Guido van Rossume023fe02001-08-30 03:12:59 +00007031 if (type != &PyUnicode_Type)
7032 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007033 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7034 kwlist, &x, &encoding, &errors))
7035 return NULL;
7036 if (x == NULL)
7037 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007038 if (encoding == NULL && errors == NULL)
7039 return PyObject_Unicode(x);
7040 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007041 return PyUnicode_FromEncodedObject(x, encoding, errors);
7042}
7043
Guido van Rossume023fe02001-08-30 03:12:59 +00007044static PyObject *
7045unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7046{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007047 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007048 int n;
7049
7050 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7051 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7052 if (tmp == NULL)
7053 return NULL;
7054 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007055 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007056 if (pnew == NULL) {
7057 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007058 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007059 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007060 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7061 if (pnew->str == NULL) {
7062 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007063 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007064 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007065 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007066 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007067 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7068 pnew->length = n;
7069 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007070 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007071 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007072}
7073
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007074PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007075"unicode(string [, encoding[, errors]]) -> object\n\
7076\n\
7077Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007078encoding defaults to the current default string encoding.\n\
7079errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007080
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081PyTypeObject PyUnicode_Type = {
7082 PyObject_HEAD_INIT(&PyType_Type)
7083 0, /* ob_size */
7084 "unicode", /* tp_name */
7085 sizeof(PyUnicodeObject), /* tp_size */
7086 0, /* tp_itemsize */
7087 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007088 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007090 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091 0, /* tp_setattr */
7092 (cmpfunc) unicode_compare, /* tp_compare */
7093 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007094 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007096 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 (hashfunc) unicode_hash, /* tp_hash*/
7098 0, /* tp_call*/
7099 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007100 PyObject_GenericGetAttr, /* tp_getattro */
7101 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007103 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7104 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007105 unicode_doc, /* tp_doc */
7106 0, /* tp_traverse */
7107 0, /* tp_clear */
7108 0, /* tp_richcompare */
7109 0, /* tp_weaklistoffset */
7110 0, /* tp_iter */
7111 0, /* tp_iternext */
7112 unicode_methods, /* tp_methods */
7113 0, /* tp_members */
7114 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007115 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007116 0, /* tp_dict */
7117 0, /* tp_descr_get */
7118 0, /* tp_descr_set */
7119 0, /* tp_dictoffset */
7120 0, /* tp_init */
7121 0, /* tp_alloc */
7122 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007123 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124};
7125
7126/* Initialize the Unicode implementation */
7127
Thomas Wouters78890102000-07-22 19:25:51 +00007128void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007130 int i;
7131
Fred Drakee4315f52000-05-09 19:53:39 +00007132 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007133 unicode_freelist = NULL;
7134 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007136 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007137 for (i = 0; i < 256; i++)
7138 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007139 if (PyType_Ready(&PyUnicode_Type) < 0)
7140 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141}
7142
7143/* Finalize the Unicode implementation */
7144
7145void
Thomas Wouters78890102000-07-22 19:25:51 +00007146_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007148 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007149 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007151 Py_XDECREF(unicode_empty);
7152 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007153
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007154 for (i = 0; i < 256; i++) {
7155 if (unicode_latin1[i]) {
7156 Py_DECREF(unicode_latin1[i]);
7157 unicode_latin1[i] = NULL;
7158 }
7159 }
7160
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007161 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 PyUnicodeObject *v = u;
7163 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007164 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007165 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007166 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007167 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007169 unicode_freelist = NULL;
7170 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007172
7173/*
7174Local variables:
7175c-basic-offset: 4
7176indent-tabs-mode: nil
7177End:
7178*/