/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000056 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
301/* Internal API for use in unicodeobject.c only ! */
302#define _PyUnicode_Resize(unicodevar, length) \
303 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
371 for (i = size; i >= 0; i--)
372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
387 if (size > PyUnicode_GET_SIZE(unicode))
388 size = PyUnicode_GET_SIZE(unicode);
389#ifdef HAVE_USABLE_WCHAR_T
390 memcpy(w, unicode->str, size * sizeof(wchar_t));
391#else
392 {
393 register Py_UNICODE *u;
394 register int i;
395 u = PyUnicode_AS_UNICODE(unicode);
396 for (i = size; i >= 0; i--)
397 *w++ = *u++;
398 }
399#endif
400
401 return size;
402}
403
404#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000408 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
Hye-Shik Chang40574832004-04-06 07:24:51 +0000426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000428}
429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromObject(register PyObject *obj)
431{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000432 /* XXX Perhaps we should make this API an alias of
433 PyObject_Unicode() instead ?! */
434 if (PyUnicode_CheckExact(obj)) {
435 Py_INCREF(obj);
436 return obj;
437 }
438 if (PyUnicode_Check(obj)) {
439 /* For a Unicode subtype that's not a Unicode object,
440 return a true Unicode object with the same data. */
441 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
442 PyUnicode_GET_SIZE(obj));
443 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000444 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
445}
446
447PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
448 const char *encoding,
449 const char *errors)
450{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000451 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000453 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 if (obj == NULL) {
456 PyErr_BadInternalCall();
457 return NULL;
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460#if 0
461 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000462 that no encodings is given and then redirect to
463 PyObject_Unicode() which then applies the additional logic for
464 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466 NOTE: This API should really only be used for object which
467 represent *encoded* Unicode !
468
469 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000470 if (PyUnicode_Check(obj)) {
471 if (encoding) {
472 PyErr_SetString(PyExc_TypeError,
473 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000474 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000475 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000476 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000477 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000478#else
479 if (PyUnicode_Check(obj)) {
480 PyErr_SetString(PyExc_TypeError,
481 "decoding Unicode is not supported");
482 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#endif
485
486 /* Coerce object */
487 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000488 s = PyString_AS_STRING(obj);
489 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
492 /* Overwrite the error message with something more useful in
493 case of a TypeError. */
494 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000495 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000496 "coercing to Unicode: need string or buffer, "
497 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 obj->ob_type->tp_name);
499 goto onError;
500 }
Tim Petersced69f82003-09-16 20:30:58 +0000501
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000502 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 if (len == 0) {
504 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000509
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 return v;
511
512 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514}
515
516PyObject *PyUnicode_Decode(const char *s,
517 int size,
518 const char *encoding,
519 const char *errors)
520{
521 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000522
523 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000524 encoding = PyUnicode_GetDefaultEncoding();
525
526 /* Shortcuts for common default encodings */
527 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000529 else if (strcmp(encoding, "latin-1") == 0)
530 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
532 else if (strcmp(encoding, "mbcs") == 0)
533 return PyUnicode_DecodeMBCS(s, size, errors);
534#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "ascii") == 0)
536 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000537
538 /* Decode via the codec registry */
539 buffer = PyBuffer_FromMemory((void *)s, size);
540 if (buffer == NULL)
541 goto onError;
542 unicode = PyCodec_Decode(buffer, encoding, errors);
543 if (unicode == NULL)
544 goto onError;
545 if (!PyUnicode_Check(unicode)) {
546 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000547 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548 unicode->ob_type->tp_name);
549 Py_DECREF(unicode);
550 goto onError;
551 }
552 Py_DECREF(buffer);
553 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000554
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555 onError:
556 Py_XDECREF(buffer);
557 return NULL;
558}
559
560PyObject *PyUnicode_Encode(const Py_UNICODE *s,
561 int size,
562 const char *encoding,
563 const char *errors)
564{
565 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000566
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 unicode = PyUnicode_FromUnicode(s, size);
568 if (unicode == NULL)
569 return NULL;
570 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
571 Py_DECREF(unicode);
572 return v;
573}
574
575PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
576 const char *encoding,
577 const char *errors)
578{
579 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581 if (!PyUnicode_Check(unicode)) {
582 PyErr_BadArgument();
583 goto onError;
584 }
Fred Drakee4315f52000-05-09 19:53:39 +0000585
Tim Petersced69f82003-09-16 20:30:58 +0000586 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000587 encoding = PyUnicode_GetDefaultEncoding();
588
589 /* Shortcuts for common default encodings */
590 if (errors == NULL) {
591 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000592 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000593 else if (strcmp(encoding, "latin-1") == 0)
594 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000595#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
596 else if (strcmp(encoding, "mbcs") == 0)
597 return PyUnicode_AsMBCSString(unicode);
598#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000599 else if (strcmp(encoding, "ascii") == 0)
600 return PyUnicode_AsASCIIString(unicode);
601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000602
603 /* Encode via the codec registry */
604 v = PyCodec_Encode(unicode, encoding, errors);
605 if (v == NULL)
606 goto onError;
607 /* XXX Should we really enforce this ? */
608 if (!PyString_Check(v)) {
609 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000610 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000611 v->ob_type->tp_name);
612 Py_DECREF(v);
613 goto onError;
614 }
615 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000616
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617 onError:
618 return NULL;
619}
620
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000621PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
622 const char *errors)
623{
624 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
625
626 if (v)
627 return v;
628 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
629 if (v && errors == NULL)
630 ((PyUnicodeObject *)unicode)->defenc = v;
631 return v;
632}
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
635{
636 if (!PyUnicode_Check(unicode)) {
637 PyErr_BadArgument();
638 goto onError;
639 }
640 return PyUnicode_AS_UNICODE(unicode);
641
642 onError:
643 return NULL;
644}
645
646int PyUnicode_GetSize(PyObject *unicode)
647{
648 if (!PyUnicode_Check(unicode)) {
649 PyErr_BadArgument();
650 goto onError;
651 }
652 return PyUnicode_GET_SIZE(unicode);
653
654 onError:
655 return -1;
656}
657
Thomas Wouters78890102000-07-22 19:25:51 +0000658const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000659{
660 return unicode_default_encoding;
661}
662
663int PyUnicode_SetDefaultEncoding(const char *encoding)
664{
665 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000666
Fred Drakee4315f52000-05-09 19:53:39 +0000667 /* Make sure the encoding is valid. As side effect, this also
668 loads the encoding into the codec registry cache. */
669 v = _PyCodec_Lookup(encoding);
670 if (v == NULL)
671 goto onError;
672 Py_DECREF(v);
673 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000674 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000675 sizeof(unicode_default_encoding));
676 return 0;
677
678 onError:
679 return -1;
680}
681
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000682/* error handling callback helper:
683 build arguments, call the callback and check the arguments,
684 if no exception occured, copy the replacement to the output
685 and adjust various state variables.
686 return 0 on success, -1 on error
687*/
688
689static
690int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
691 const char *encoding, const char *reason,
692 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
693 PyObject **output, int *outpos, Py_UNICODE **outptr)
694{
695 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
696
697 PyObject *restuple = NULL;
698 PyObject *repunicode = NULL;
699 int outsize = PyUnicode_GET_SIZE(*output);
700 int requiredsize;
701 int newpos;
702 Py_UNICODE *repptr;
703 int repsize;
704 int res = -1;
705
706 if (*errorHandler == NULL) {
707 *errorHandler = PyCodec_LookupError(errors);
708 if (*errorHandler == NULL)
709 goto onError;
710 }
711
712 if (*exceptionObject == NULL) {
713 *exceptionObject = PyUnicodeDecodeError_Create(
714 encoding, input, insize, *startinpos, *endinpos, reason);
715 if (*exceptionObject == NULL)
716 goto onError;
717 }
718 else {
719 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
720 goto onError;
721 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
722 goto onError;
723 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
724 goto onError;
725 }
726
727 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
728 if (restuple == NULL)
729 goto onError;
730 if (!PyTuple_Check(restuple)) {
731 PyErr_Format(PyExc_TypeError, &argparse[4]);
732 goto onError;
733 }
734 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
735 goto onError;
736 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000737 newpos = insize+newpos;
738 if (newpos<0 || newpos>insize) {
739 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
740 goto onError;
741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000742
743 /* need more space? (at least enough for what we
744 have+the replacement+the rest of the string (starting
745 at the new input position), so we won't have to check space
746 when there are no errors in the rest of the string) */
747 repptr = PyUnicode_AS_UNICODE(repunicode);
748 repsize = PyUnicode_GET_SIZE(repunicode);
749 requiredsize = *outpos + repsize + insize-newpos;
750 if (requiredsize > outsize) {
751 if (requiredsize<2*outsize)
752 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000753 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000754 goto onError;
755 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
756 }
757 *endinpos = newpos;
758 *inptr = input + newpos;
759 Py_UNICODE_COPY(*outptr, repptr, repsize);
760 *outptr += repsize;
761 *outpos += repsize;
762 /* we made it! */
763 res = 0;
764
765 onError:
766 Py_XDECREF(restuple);
767 return res;
768}
769
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000770/* --- UTF-7 Codec -------------------------------------------------------- */
771
772/* see RFC2152 for details */
773
static
char utf7_special[128] = {
    /* Per-character UTF-7 classification for the 128 ASCII code points.
       Each entry says whether the character may be written directly:
         0 - not special
         1 - special
         2 - whitespace (optional)
         3 - RFC2152 Set O (optional) */

    /* 0x00 - 0x0f: control characters (TAB, LF and CR are class 2) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
792
/* SPECIAL(c, encodeO, encodeWS): non-zero when character c must not be
   written directly in UTF-7.  Anything outside the 7-bit range is special;
   inside it, utf7_special[] decides, with class 2 (whitespace) and class 3
   (Set O) only counting as special when encodeWS / encodeO are set.
   The range test is done with a mask so that the table is never indexed
   with a value outside 0..127, whatever the signedness of c's type. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((((c) & ~0x7F) != 0 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): non-zero when c belongs to the base64 alphabet.
   isalnum() is only defined for values representable as unsigned char (or
   EOF) and is locale dependent above 127, so only 7-bit values are ever
   handed to it; everything else is rejected up front. */
#define B64CHAR(c) \
    ((((c) & ~0x7F) == 0 && isalnum((int)(c))) || (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of base64 character c.  The result is only
   meaningful when B64CHAR(c) holds. */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least six bits are pending in the
   accumulator ch, emit the topmost six as one base64 character through out
   and decrease bits accordingly.  out and bits are modified in place. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are pending
   in the accumulator ch, extract the topmost sixteen as one code unit and
   store it through out.  out, bits and surrogate are modified in place.
   The expansion assigns to a variable named errmsg and jumps to a label
   named utf7Error, so both must exist where the macro is used.
   NOTE(review): the range tested below is 0xDC00..0xDFFF although the
   comments speak of the high surrogate -- confirm which half is intended. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct \
               or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
827
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000828PyObject *PyUnicode_DecodeUTF7(const char *s,
829 int size,
830 const char *errors)
831{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000832 const char *starts = s;
833 int startinpos;
834 int endinpos;
835 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000836 const char *e;
837 PyUnicodeObject *unicode;
838 Py_UNICODE *p;
839 const char *errmsg = "";
840 int inShift = 0;
841 unsigned int bitsleft = 0;
842 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000843 int surrogate = 0;
844 PyObject *errorHandler = NULL;
845 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000846
847 unicode = _PyUnicode_New(size);
848 if (!unicode)
849 return NULL;
850 if (size == 0)
851 return (PyObject *)unicode;
852
853 p = unicode->str;
854 e = s + size;
855
856 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000857 Py_UNICODE ch;
858 restart:
859 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000860
861 if (inShift) {
862 if ((ch == '-') || !B64CHAR(ch)) {
863 inShift = 0;
864 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000865
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000866 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
867 if (bitsleft >= 6) {
868 /* The shift sequence has a partial character in it. If
869 bitsleft < 6 then we could just classify it as padding
870 but that is not the case here */
871
872 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000873 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000874 }
875 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000876 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000877 here so indicate the potential of a misencoded character. */
878
879 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
880 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
881 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000882 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000883 }
884
885 if (ch == '-') {
886 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000887 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000888 inShift = 1;
889 }
890 } else if (SPECIAL(ch,0,0)) {
891 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000892 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000893 } else {
894 *p++ = ch;
895 }
896 } else {
897 charsleft = (charsleft << 6) | UB64(ch);
898 bitsleft += 6;
899 s++;
900 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
901 }
902 }
903 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000905 s++;
906 if (s < e && *s == '-') {
907 s++;
908 *p++ = '+';
909 } else
910 {
911 inShift = 1;
912 bitsleft = 0;
913 }
914 }
915 else if (SPECIAL(ch,0,0)) {
916 errmsg = "unexpected special character";
917 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000918 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000919 }
920 else {
921 *p++ = ch;
922 s++;
923 }
924 continue;
925 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000926 outpos = p-PyUnicode_AS_UNICODE(unicode);
927 endinpos = s-starts;
928 if (unicode_decode_call_errorhandler(
929 errors, &errorHandler,
930 "utf7", errmsg,
931 starts, size, &startinpos, &endinpos, &exc, &s,
932 (PyObject **)&unicode, &outpos, &p))
933 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000934 }
935
936 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000937 outpos = p-PyUnicode_AS_UNICODE(unicode);
938 endinpos = size;
939 if (unicode_decode_call_errorhandler(
940 errors, &errorHandler,
941 "utf7", "unterminated shift sequence",
942 starts, size, &startinpos, &endinpos, &exc, &s,
943 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000945 if (s < e)
946 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000947 }
948
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000949 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 goto onError;
951
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000952 Py_XDECREF(errorHandler);
953 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000954 return (PyObject *)unicode;
955
956onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 Py_XDECREF(errorHandler);
958 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000959 Py_DECREF(unicode);
960 return NULL;
961}
962
963
964PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
965 int size,
966 int encodeSetO,
967 int encodeWhiteSpace,
968 const char *errors)
969{
970 PyObject *v;
971 /* It might be possible to tighten this worst case */
972 unsigned int cbAllocated = 5 * size;
973 int inShift = 0;
974 int i = 0;
975 unsigned int bitsleft = 0;
976 unsigned long charsleft = 0;
977 char * out;
978 char * start;
979
980 if (size == 0)
981 return PyString_FromStringAndSize(NULL, 0);
982
983 v = PyString_FromStringAndSize(NULL, cbAllocated);
984 if (v == NULL)
985 return NULL;
986
987 start = out = PyString_AS_STRING(v);
988 for (;i < size; ++i) {
989 Py_UNICODE ch = s[i];
990
991 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +0000992 if (ch == '+') {
993 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000994 *out++ = '-';
995 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
996 charsleft = ch;
997 bitsleft = 16;
998 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +0000999 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001000 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001001 } else {
1002 *out++ = (char) ch;
1003 }
1004 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001005 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1006 *out++ = B64(charsleft << (6-bitsleft));
1007 charsleft = 0;
1008 bitsleft = 0;
1009 /* Characters not in the BASE64 set implicitly unshift the sequence
1010 so no '-' is required, except if the character is itself a '-' */
1011 if (B64CHAR(ch) || ch == '-') {
1012 *out++ = '-';
1013 }
1014 inShift = 0;
1015 *out++ = (char) ch;
1016 } else {
1017 bitsleft += 16;
1018 charsleft = (charsleft << 16) | ch;
1019 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1020
1021 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001022 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001023 or '-' then the shift sequence will be terminated implicitly and we
1024 don't have to insert a '-'. */
1025
1026 if (bitsleft == 0) {
1027 if (i + 1 < size) {
1028 Py_UNICODE ch2 = s[i+1];
1029
1030 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001031
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001032 } else if (B64CHAR(ch2) || ch2 == '-') {
1033 *out++ = '-';
1034 inShift = 0;
1035 } else {
1036 inShift = 0;
1037 }
1038
1039 }
1040 else {
1041 *out++ = '-';
1042 inShift = 0;
1043 }
1044 }
Tim Petersced69f82003-09-16 20:30:58 +00001045 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001046 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001047 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 if (bitsleft) {
1049 *out++= B64(charsleft << (6-bitsleft) );
1050 *out++ = '-';
1051 }
1052
Tim Peters5de98422002-04-27 18:44:32 +00001053 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001054 return v;
1055}
1056
/* The UTF-7 helper macros are not needed past this point */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1063
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064/* --- UTF-8 Codec -------------------------------------------------------- */
1065
static
char utf8_code_length[256] = {
    /* Length of the UTF-8 sequence introduced by each possible lead byte;
       zero marks a byte that is not a legal prefix.  See RFC 2279 for
       details. */

    /* 0x00 - 0x7f: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six byte sequences; 0xfe/0xff illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1087
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088PyObject *PyUnicode_DecodeUTF8(const char *s,
1089 int size,
1090 const char *errors)
1091{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001092 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001094 int startinpos;
1095 int endinpos;
1096 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 const char *e;
1098 PyUnicodeObject *unicode;
1099 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001100 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001101 PyObject *errorHandler = NULL;
1102 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103
1104 /* Note: size will always be longer than the resulting Unicode
1105 character count */
1106 unicode = _PyUnicode_New(size);
1107 if (!unicode)
1108 return NULL;
1109 if (size == 0)
1110 return (PyObject *)unicode;
1111
1112 /* Unpack UTF-8 encoded data */
1113 p = unicode->str;
1114 e = s + size;
1115
1116 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001117 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118
1119 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001120 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 s++;
1122 continue;
1123 }
1124
1125 n = utf8_code_length[ch];
1126
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001127 if (s + n > e) {
1128 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001129 startinpos = s-starts;
1130 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001131 goto utf8Error;
1132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133
1134 switch (n) {
1135
1136 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001137 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001138 startinpos = s-starts;
1139 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001140 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141
1142 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001143 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001144 startinpos = s-starts;
1145 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001146 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147
1148 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001149 if ((s[1] & 0xc0) != 0x80) {
1150 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001151 startinpos = s-starts;
1152 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001153 goto utf8Error;
1154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001156 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001157 startinpos = s-starts;
1158 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001159 errmsg = "illegal encoding";
1160 goto utf8Error;
1161 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001163 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 break;
1165
1166 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001167 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001168 (s[2] & 0xc0) != 0x80) {
1169 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001170 startinpos = s-starts;
1171 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001172 goto utf8Error;
1173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001175 if (ch < 0x0800) {
1176 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001177 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001178
1179 XXX For wide builds (UCS-4) we should probably try
1180 to recombine the surrogates into a single code
1181 unit.
1182 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001183 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001184 startinpos = s-starts;
1185 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001186 goto utf8Error;
1187 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001189 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001190 break;
1191
1192 case 4:
1193 if ((s[1] & 0xc0) != 0x80 ||
1194 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001195 (s[3] & 0xc0) != 0x80) {
1196 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001197 startinpos = s-starts;
1198 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001199 goto utf8Error;
1200 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001201 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1202 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1203 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001204 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001205 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001206 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001207 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001208 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001209 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001210 startinpos = s-starts;
1211 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001212 goto utf8Error;
1213 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001214#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001215 *p++ = (Py_UNICODE)ch;
1216#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001217 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001218
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001219 /* translate from 10000..10FFFF to 0..FFFF */
1220 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001221
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001222 /* high surrogate = top 10 bits added to D800 */
1223 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001224
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001225 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001226 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001227#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228 break;
1229
1230 default:
1231 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001232 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001233 startinpos = s-starts;
1234 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236 }
1237 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001238 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001239
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001240 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001241 outpos = p-PyUnicode_AS_UNICODE(unicode);
1242 if (unicode_decode_call_errorhandler(
1243 errors, &errorHandler,
1244 "utf8", errmsg,
1245 starts, size, &startinpos, &endinpos, &exc, &s,
1246 (PyObject **)&unicode, &outpos, &p))
1247 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 }
1249
1250 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001251 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252 goto onError;
1253
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001254 Py_XDECREF(errorHandler);
1255 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 return (PyObject *)unicode;
1257
1258onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001259 Py_XDECREF(errorHandler);
1260 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001261 Py_DECREF(unicode);
1262 return NULL;
1263}
1264
Tim Peters602f7402002-04-27 18:03:26 +00001265/* Allocation strategy: if the string is short, convert into a stack buffer
1266 and allocate exactly as much space needed at the end. Else allocate the
1267 maximum possible needed (4 result bytes per Unicode character), and return
1268 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001269*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001270PyObject *
1271PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1272 int size,
1273 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274{
Tim Peters602f7402002-04-27 18:03:26 +00001275#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001276
Tim Peters602f7402002-04-27 18:03:26 +00001277 int i; /* index into s of next input byte */
1278 PyObject *v; /* result string object */
1279 char *p; /* next free byte in output buffer */
1280 int nallocated; /* number of result bytes allocated */
1281 int nneeded; /* number of result bytes needed */
1282 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001283
Tim Peters602f7402002-04-27 18:03:26 +00001284 assert(s != NULL);
1285 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286
Tim Peters602f7402002-04-27 18:03:26 +00001287 if (size <= MAX_SHORT_UNICHARS) {
1288 /* Write into the stack buffer; nallocated can't overflow.
1289 * At the end, we'll allocate exactly as much heap space as it
1290 * turns out we need.
1291 */
1292 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1293 v = NULL; /* will allocate after we're done */
1294 p = stackbuf;
1295 }
1296 else {
1297 /* Overallocate on the heap, and give the excess back at the end. */
1298 nallocated = size * 4;
1299 if (nallocated / 4 != size) /* overflow! */
1300 return PyErr_NoMemory();
1301 v = PyString_FromStringAndSize(NULL, nallocated);
1302 if (v == NULL)
1303 return NULL;
1304 p = PyString_AS_STRING(v);
1305 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001306
Tim Peters602f7402002-04-27 18:03:26 +00001307 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001308 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001309
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001310 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001311 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001313
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001315 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001316 *p++ = (char)(0xc0 | (ch >> 6));
1317 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001318 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001319 else {
Tim Peters602f7402002-04-27 18:03:26 +00001320 /* Encode UCS2 Unicode ordinals */
1321 if (ch < 0x10000) {
1322 /* Special case: check for high surrogate */
1323 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1324 Py_UCS4 ch2 = s[i];
1325 /* Check for low surrogate and combine the two to
1326 form a UCS4 value */
1327 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001328 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001329 i++;
1330 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001331 }
Tim Peters602f7402002-04-27 18:03:26 +00001332 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001333 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001334 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001335 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1336 *p++ = (char)(0x80 | (ch & 0x3f));
1337 continue;
1338 }
1339encodeUCS4:
1340 /* Encode UCS4 Unicode ordinals */
1341 *p++ = (char)(0xf0 | (ch >> 18));
1342 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1343 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1344 *p++ = (char)(0x80 | (ch & 0x3f));
1345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001347
Tim Peters602f7402002-04-27 18:03:26 +00001348 if (v == NULL) {
1349 /* This was stack allocated. */
1350 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1351 assert(nneeded <= nallocated);
1352 v = PyString_FromStringAndSize(stackbuf, nneeded);
1353 }
1354 else {
1355 /* Cut back to size actually needed. */
1356 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1357 assert(nneeded <= nallocated);
1358 _PyString_Resize(&v, nneeded);
1359 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001361
Tim Peters602f7402002-04-27 18:03:26 +00001362#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363}
1364
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1366{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 if (!PyUnicode_Check(unicode)) {
1368 PyErr_BadArgument();
1369 return NULL;
1370 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001371 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1372 PyUnicode_GET_SIZE(unicode),
1373 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001374}
1375
1376/* --- UTF-16 Codec ------------------------------------------------------- */
1377
Tim Peters772747b2001-08-09 22:21:55 +00001378PyObject *
1379PyUnicode_DecodeUTF16(const char *s,
1380 int size,
1381 const char *errors,
1382 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001384 const char *starts = s;
1385 int startinpos;
1386 int endinpos;
1387 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388 PyUnicodeObject *unicode;
1389 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001390 const unsigned char *q, *e;
1391 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001392 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001393 /* Offsets from q for retrieving byte pairs in the right order. */
1394#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1395 int ihi = 1, ilo = 0;
1396#else
1397 int ihi = 0, ilo = 1;
1398#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001399 PyObject *errorHandler = NULL;
1400 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401
1402 /* Note: size will always be longer than the resulting Unicode
1403 character count */
1404 unicode = _PyUnicode_New(size);
1405 if (!unicode)
1406 return NULL;
1407 if (size == 0)
1408 return (PyObject *)unicode;
1409
1410 /* Unpack UTF-16 encoded data */
1411 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001412 q = (unsigned char *)s;
1413 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414
1415 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001416 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001418 /* Check for BOM marks (U+FEFF) in the input and adjust current
1419 byte order setting accordingly. In native mode, the leading BOM
1420 mark is skipped, in all other modes, it is copied to the output
1421 stream as-is (giving a ZWNBSP character). */
1422 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001423 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001424#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001425 if (bom == 0xFEFF) {
1426 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001427 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001428 }
1429 else if (bom == 0xFFFE) {
1430 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001431 bo = 1;
1432 }
Tim Petersced69f82003-09-16 20:30:58 +00001433#else
Tim Peters772747b2001-08-09 22:21:55 +00001434 if (bom == 0xFEFF) {
1435 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001436 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001437 }
1438 else if (bom == 0xFFFE) {
1439 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001440 bo = -1;
1441 }
1442#endif
1443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444
Tim Peters772747b2001-08-09 22:21:55 +00001445 if (bo == -1) {
1446 /* force LE */
1447 ihi = 1;
1448 ilo = 0;
1449 }
1450 else if (bo == 1) {
1451 /* force BE */
1452 ihi = 0;
1453 ilo = 1;
1454 }
1455
1456 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001457 Py_UNICODE ch;
1458 /* remaing bytes at the end? (size should be even) */
1459 if (e-q<2) {
1460 errmsg = "truncated data";
1461 startinpos = ((const char *)q)-starts;
1462 endinpos = ((const char *)e)-starts;
1463 goto utf16Error;
1464 /* The remaining input chars are ignored if the callback
1465 chooses to skip the input */
1466 }
1467 ch = (q[ihi] << 8) | q[ilo];
1468
Tim Peters772747b2001-08-09 22:21:55 +00001469 q += 2;
1470
Guido van Rossumd57fd912000-03-10 22:53:23 +00001471 if (ch < 0xD800 || ch > 0xDFFF) {
1472 *p++ = ch;
1473 continue;
1474 }
1475
1476 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001477 if (q >= e) {
1478 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001479 startinpos = (((const char *)q)-2)-starts;
1480 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001481 goto utf16Error;
1482 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001483 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001484 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1485 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001486 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001487#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001488 *p++ = ch;
1489 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001490#else
1491 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001492#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001493 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001494 }
1495 else {
1496 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001497 startinpos = (((const char *)q)-4)-starts;
1498 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001499 goto utf16Error;
1500 }
1501
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001503 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 startinpos = (((const char *)q)-2)-starts;
1505 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001506 /* Fall through to report the error */
1507
1508 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001509 outpos = p-PyUnicode_AS_UNICODE(unicode);
1510 if (unicode_decode_call_errorhandler(
1511 errors, &errorHandler,
1512 "utf16", errmsg,
1513 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1514 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001515 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516 }
1517
1518 if (byteorder)
1519 *byteorder = bo;
1520
1521 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001522 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001523 goto onError;
1524
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 Py_XDECREF(errorHandler);
1526 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527 return (PyObject *)unicode;
1528
1529onError:
1530 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001531 Py_XDECREF(errorHandler);
1532 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533 return NULL;
1534}
1535
Tim Peters772747b2001-08-09 22:21:55 +00001536PyObject *
1537PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1538 int size,
1539 const char *errors,
1540 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541{
1542 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001543 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001544#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001545 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001546#else
1547 const int pairs = 0;
1548#endif
Tim Peters772747b2001-08-09 22:21:55 +00001549 /* Offsets from p for storing byte pairs in the right order. */
1550#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1551 int ihi = 1, ilo = 0;
1552#else
1553 int ihi = 0, ilo = 1;
1554#endif
1555
1556#define STORECHAR(CH) \
1557 do { \
1558 p[ihi] = ((CH) >> 8) & 0xff; \
1559 p[ilo] = (CH) & 0xff; \
1560 p += 2; \
1561 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001563#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001564 for (i = pairs = 0; i < size; i++)
1565 if (s[i] >= 0x10000)
1566 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001567#endif
Tim Petersced69f82003-09-16 20:30:58 +00001568 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001569 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570 if (v == NULL)
1571 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001572
Tim Peters772747b2001-08-09 22:21:55 +00001573 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001574 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001575 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001576 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001577 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001578
1579 if (byteorder == -1) {
1580 /* force LE */
1581 ihi = 1;
1582 ilo = 0;
1583 }
1584 else if (byteorder == 1) {
1585 /* force BE */
1586 ihi = 0;
1587 ilo = 1;
1588 }
1589
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001590 while (size-- > 0) {
1591 Py_UNICODE ch = *s++;
1592 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001593#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001594 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001595 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1596 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001598#endif
Tim Peters772747b2001-08-09 22:21:55 +00001599 STORECHAR(ch);
1600 if (ch2)
1601 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001604#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605}
1606
1607PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1608{
1609 if (!PyUnicode_Check(unicode)) {
1610 PyErr_BadArgument();
1611 return NULL;
1612 }
1613 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1614 PyUnicode_GET_SIZE(unicode),
1615 NULL,
1616 0);
1617}
1618
1619/* --- Unicode Escape Codec ----------------------------------------------- */
1620
Fredrik Lundh06d12682001-01-24 07:59:11 +00001621static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001622
Guido van Rossumd57fd912000-03-10 22:53:23 +00001623PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1624 int size,
1625 const char *errors)
1626{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001627 const char *starts = s;
1628 int startinpos;
1629 int endinpos;
1630 int outpos;
1631 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001632 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001634 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001635 char* message;
1636 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001637 PyObject *errorHandler = NULL;
1638 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001639
Guido van Rossumd57fd912000-03-10 22:53:23 +00001640 /* Escaped strings will always be longer than the resulting
1641 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 length after conversion to the true value.
1643 (but if the error callback returns a long replacement string
1644 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001645 v = _PyUnicode_New(size);
1646 if (v == NULL)
1647 goto onError;
1648 if (size == 0)
1649 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001650
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001651 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001653
Guido van Rossumd57fd912000-03-10 22:53:23 +00001654 while (s < end) {
1655 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001656 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001657 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001658
1659 /* Non-escape characters are interpreted as Unicode ordinals */
1660 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001661 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001662 continue;
1663 }
1664
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001665 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666 /* \ - Escapes */
1667 s++;
1668 switch (*s++) {
1669
1670 /* \x escapes */
1671 case '\n': break;
1672 case '\\': *p++ = '\\'; break;
1673 case '\'': *p++ = '\''; break;
1674 case '\"': *p++ = '\"'; break;
1675 case 'b': *p++ = '\b'; break;
1676 case 'f': *p++ = '\014'; break; /* FF */
1677 case 't': *p++ = '\t'; break;
1678 case 'n': *p++ = '\n'; break;
1679 case 'r': *p++ = '\r'; break;
1680 case 'v': *p++ = '\013'; break; /* VT */
1681 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1682
1683 /* \OOO (octal) escapes */
1684 case '0': case '1': case '2': case '3':
1685 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001686 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001688 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001690 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001691 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001692 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 break;
1694
Fredrik Lundhccc74732001-02-18 22:13:49 +00001695 /* hex escapes */
1696 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001697 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001698 digits = 2;
1699 message = "truncated \\xXX escape";
1700 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701
Fredrik Lundhccc74732001-02-18 22:13:49 +00001702 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001704 digits = 4;
1705 message = "truncated \\uXXXX escape";
1706 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707
Fredrik Lundhccc74732001-02-18 22:13:49 +00001708 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001709 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001710 digits = 8;
1711 message = "truncated \\UXXXXXXXX escape";
1712 hexescape:
1713 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001714 outpos = p-PyUnicode_AS_UNICODE(v);
1715 if (s+digits>end) {
1716 endinpos = size;
1717 if (unicode_decode_call_errorhandler(
1718 errors, &errorHandler,
1719 "unicodeescape", "end of string in escape sequence",
1720 starts, size, &startinpos, &endinpos, &exc, &s,
1721 (PyObject **)&v, &outpos, &p))
1722 goto onError;
1723 goto nextByte;
1724 }
1725 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001726 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001727 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 endinpos = (s+i+1)-starts;
1729 if (unicode_decode_call_errorhandler(
1730 errors, &errorHandler,
1731 "unicodeescape", message,
1732 starts, size, &startinpos, &endinpos, &exc, &s,
1733 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001734 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001735 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001736 }
1737 chr = (chr<<4) & ~0xF;
1738 if (c >= '0' && c <= '9')
1739 chr += c - '0';
1740 else if (c >= 'a' && c <= 'f')
1741 chr += 10 + c - 'a';
1742 else
1743 chr += 10 + c - 'A';
1744 }
1745 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001746 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001747 /* _decoding_error will have already written into the
1748 target buffer. */
1749 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001750 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001751 /* when we get here, chr is a 32-bit unicode character */
1752 if (chr <= 0xffff)
1753 /* UCS-2 character */
1754 *p++ = (Py_UNICODE) chr;
1755 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001756 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001757 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001758#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001759 *p++ = chr;
1760#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001761 chr -= 0x10000L;
1762 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001763 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001764#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001765 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001766 endinpos = s-starts;
1767 outpos = p-PyUnicode_AS_UNICODE(v);
1768 if (unicode_decode_call_errorhandler(
1769 errors, &errorHandler,
1770 "unicodeescape", "illegal Unicode character",
1771 starts, size, &startinpos, &endinpos, &exc, &s,
1772 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001773 goto onError;
1774 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001775 break;
1776
1777 /* \N{name} */
1778 case 'N':
1779 message = "malformed \\N character escape";
1780 if (ucnhash_CAPI == NULL) {
1781 /* load the unicode data module */
1782 PyObject *m, *v;
1783 m = PyImport_ImportModule("unicodedata");
1784 if (m == NULL)
1785 goto ucnhashError;
1786 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1787 Py_DECREF(m);
1788 if (v == NULL)
1789 goto ucnhashError;
1790 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1791 Py_DECREF(v);
1792 if (ucnhash_CAPI == NULL)
1793 goto ucnhashError;
1794 }
1795 if (*s == '{') {
1796 const char *start = s+1;
1797 /* look for the closing brace */
1798 while (*s != '}' && s < end)
1799 s++;
1800 if (s > start && s < end && *s == '}') {
1801 /* found a name. look it up in the unicode database */
1802 message = "unknown Unicode character name";
1803 s++;
1804 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1805 goto store;
1806 }
1807 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001808 endinpos = s-starts;
1809 outpos = p-PyUnicode_AS_UNICODE(v);
1810 if (unicode_decode_call_errorhandler(
1811 errors, &errorHandler,
1812 "unicodeescape", message,
1813 starts, size, &startinpos, &endinpos, &exc, &s,
1814 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001815 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001816 break;
1817
1818 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001819 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001820 message = "\\ at end of string";
1821 s--;
1822 endinpos = s-starts;
1823 outpos = p-PyUnicode_AS_UNICODE(v);
1824 if (unicode_decode_call_errorhandler(
1825 errors, &errorHandler,
1826 "unicodeescape", message,
1827 starts, size, &startinpos, &endinpos, &exc, &s,
1828 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001829 goto onError;
1830 }
1831 else {
1832 *p++ = '\\';
1833 *p++ = (unsigned char)s[-1];
1834 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001835 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 nextByte:
1838 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001840 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001841 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001842 Py_XDECREF(errorHandler);
1843 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001845
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001847 PyErr_SetString(
1848 PyExc_UnicodeError,
1849 "\\N escapes not supported (can't load unicodedata module)"
1850 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001851 Py_XDECREF(errorHandler);
1852 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001853 return NULL;
1854
Fredrik Lundhccc74732001-02-18 22:13:49 +00001855onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001857 Py_XDECREF(errorHandler);
1858 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859 return NULL;
1860}
1861
1862/* Return a Unicode-Escape string version of the Unicode object.
1863
1864 If quotes is true, the string is enclosed in u"" or u'' quotes as
1865 appropriate.
1866
1867*/
1868
Barry Warsaw51ac5802000-03-20 16:36:48 +00001869static const Py_UNICODE *findchar(const Py_UNICODE *s,
1870 int size,
1871 Py_UNICODE ch);
1872
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873static
1874PyObject *unicodeescape_string(const Py_UNICODE *s,
1875 int size,
1876 int quotes)
1877{
1878 PyObject *repr;
1879 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001881 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001882
1883 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1884 if (repr == NULL)
1885 return NULL;
1886
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001887 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888
1889 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001891 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001892 !findchar(s, size, '"')) ? '"' : '\'';
1893 }
1894 while (size-- > 0) {
1895 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001896
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001898 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001899 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001900 *p++ = '\\';
1901 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001902 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001903 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001904
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001905#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001906 /* Map 21-bit characters to '\U00xxxxxx' */
1907 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001908 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001909
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001910 /* Resize the string if necessary */
1911 if (offset + 12 > PyString_GET_SIZE(repr)) {
1912 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001913 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001914 p = PyString_AS_STRING(repr) + offset;
1915 }
1916
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001917 *p++ = '\\';
1918 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001919 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1920 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1921 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1922 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1923 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1924 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1925 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001926 *p++ = hexdigit[ch & 0x0000000F];
1927 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001928 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001929#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001930 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1931 else if (ch >= 0xD800 && ch < 0xDC00) {
1932 Py_UNICODE ch2;
1933 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00001934
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001935 ch2 = *s++;
1936 size--;
1937 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1938 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1939 *p++ = '\\';
1940 *p++ = 'U';
1941 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1942 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1943 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1944 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1945 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1946 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1947 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1948 *p++ = hexdigit[ucs & 0x0000000F];
1949 continue;
1950 }
1951 /* Fall through: isolated surrogates are copied as-is */
1952 s--;
1953 size++;
1954 }
1955
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001957 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001958 *p++ = '\\';
1959 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001960 *p++ = hexdigit[(ch >> 12) & 0x000F];
1961 *p++ = hexdigit[(ch >> 8) & 0x000F];
1962 *p++ = hexdigit[(ch >> 4) & 0x000F];
1963 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001965
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001966 /* Map special whitespace to '\t', \n', '\r' */
1967 else if (ch == '\t') {
1968 *p++ = '\\';
1969 *p++ = 't';
1970 }
1971 else if (ch == '\n') {
1972 *p++ = '\\';
1973 *p++ = 'n';
1974 }
1975 else if (ch == '\r') {
1976 *p++ = '\\';
1977 *p++ = 'r';
1978 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001979
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001980 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001981 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001983 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001984 *p++ = hexdigit[(ch >> 4) & 0x000F];
1985 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00001986 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001987
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 /* Copy everything else as-is */
1989 else
1990 *p++ = (char) ch;
1991 }
1992 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001993 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994
1995 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001996 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 return repr;
1998}
1999
2000PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2001 int size)
2002{
2003 return unicodeescape_string(s, size, 0);
2004}
2005
2006PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2007{
2008 if (!PyUnicode_Check(unicode)) {
2009 PyErr_BadArgument();
2010 return NULL;
2011 }
2012 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2013 PyUnicode_GET_SIZE(unicode));
2014}
2015
2016/* --- Raw Unicode Escape Codec ------------------------------------------- */
2017
2018PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2019 int size,
2020 const char *errors)
2021{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002022 const char *starts = s;
2023 int startinpos;
2024 int endinpos;
2025 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002026 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002027 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 const char *end;
2029 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002030 PyObject *errorHandler = NULL;
2031 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002032
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 /* Escaped strings will always be longer than the resulting
2034 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002035 length after conversion to the true value. (But decoding error
2036 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 v = _PyUnicode_New(size);
2038 if (v == NULL)
2039 goto onError;
2040 if (size == 0)
2041 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002042 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 end = s + size;
2044 while (s < end) {
2045 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002046 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002048 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049
2050 /* Non-escape characters are interpreted as Unicode ordinals */
2051 if (*s != '\\') {
2052 *p++ = (unsigned char)*s++;
2053 continue;
2054 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002055 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056
2057 /* \u-escapes are only interpreted iff the number of leading
2058 backslashes if odd */
2059 bs = s;
2060 for (;s < end;) {
2061 if (*s != '\\')
2062 break;
2063 *p++ = (unsigned char)*s++;
2064 }
2065 if (((s - bs) & 1) == 0 ||
2066 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002067 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 continue;
2069 }
2070 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002071 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 s++;
2073
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002074 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002075 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002076 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002077 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 endinpos = s-starts;
2080 if (unicode_decode_call_errorhandler(
2081 errors, &errorHandler,
2082 "rawunicodeescape", "truncated \\uXXXX",
2083 starts, size, &startinpos, &endinpos, &exc, &s,
2084 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002086 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 }
2088 x = (x<<4) & ~0xF;
2089 if (c >= '0' && c <= '9')
2090 x += c - '0';
2091 else if (c >= 'a' && c <= 'f')
2092 x += 10 + c - 'a';
2093 else
2094 x += 10 + c - 'A';
2095 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002096#ifndef Py_UNICODE_WIDE
2097 if (x > 0x10000) {
2098 if (unicode_decode_call_errorhandler(
2099 errors, &errorHandler,
2100 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2101 starts, size, &startinpos, &endinpos, &exc, &s,
2102 (PyObject **)&v, &outpos, &p))
2103 goto onError;
2104 }
2105#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002106 *p++ = x;
2107 nextByte:
2108 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002110 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002111 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002112 Py_XDECREF(errorHandler);
2113 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002115
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 onError:
2117 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002118 Py_XDECREF(errorHandler);
2119 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 return NULL;
2121}
2122
2123PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2124 int size)
2125{
2126 PyObject *repr;
2127 char *p;
2128 char *q;
2129
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002130 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002132#ifdef Py_UNICODE_WIDE
2133 repr = PyString_FromStringAndSize(NULL, 10 * size);
2134#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002136#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137 if (repr == NULL)
2138 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002139 if (size == 0)
2140 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141
2142 p = q = PyString_AS_STRING(repr);
2143 while (size-- > 0) {
2144 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002145#ifdef Py_UNICODE_WIDE
2146 /* Map 32-bit characters to '\Uxxxxxxxx' */
2147 if (ch >= 0x10000) {
2148 *p++ = '\\';
2149 *p++ = 'U';
2150 *p++ = hexdigit[(ch >> 28) & 0xf];
2151 *p++ = hexdigit[(ch >> 24) & 0xf];
2152 *p++ = hexdigit[(ch >> 20) & 0xf];
2153 *p++ = hexdigit[(ch >> 16) & 0xf];
2154 *p++ = hexdigit[(ch >> 12) & 0xf];
2155 *p++ = hexdigit[(ch >> 8) & 0xf];
2156 *p++ = hexdigit[(ch >> 4) & 0xf];
2157 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002158 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002159 else
2160#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 /* Map 16-bit characters to '\uxxxx' */
2162 if (ch >= 256) {
2163 *p++ = '\\';
2164 *p++ = 'u';
2165 *p++ = hexdigit[(ch >> 12) & 0xf];
2166 *p++ = hexdigit[(ch >> 8) & 0xf];
2167 *p++ = hexdigit[(ch >> 4) & 0xf];
2168 *p++ = hexdigit[ch & 15];
2169 }
2170 /* Copy everything else as-is */
2171 else
2172 *p++ = (char) ch;
2173 }
2174 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002175 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 return repr;
2177}
2178
2179PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2180{
2181 if (!PyUnicode_Check(unicode)) {
2182 PyErr_BadArgument();
2183 return NULL;
2184 }
2185 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2186 PyUnicode_GET_SIZE(unicode));
2187}
2188
2189/* --- Latin-1 Codec ------------------------------------------------------ */
2190
2191PyObject *PyUnicode_DecodeLatin1(const char *s,
2192 int size,
2193 const char *errors)
2194{
2195 PyUnicodeObject *v;
2196 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002197
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002199 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002200 Py_UNICODE r = *(unsigned char*)s;
2201 return PyUnicode_FromUnicode(&r, 1);
2202 }
2203
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 v = _PyUnicode_New(size);
2205 if (v == NULL)
2206 goto onError;
2207 if (size == 0)
2208 return (PyObject *)v;
2209 p = PyUnicode_AS_UNICODE(v);
2210 while (size-- > 0)
2211 *p++ = (unsigned char)*s++;
2212 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002213
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 onError:
2215 Py_XDECREF(v);
2216 return NULL;
2217}
2218
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002219/* create or adjust a UnicodeEncodeError */
2220static void make_encode_exception(PyObject **exceptionObject,
2221 const char *encoding,
2222 const Py_UNICODE *unicode, int size,
2223 int startpos, int endpos,
2224 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002225{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002226 if (*exceptionObject == NULL) {
2227 *exceptionObject = PyUnicodeEncodeError_Create(
2228 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229 }
2230 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002231 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2232 goto onError;
2233 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2234 goto onError;
2235 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2236 goto onError;
2237 return;
2238 onError:
2239 Py_DECREF(*exceptionObject);
2240 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 }
2242}
2243
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244/* raises a UnicodeEncodeError */
2245static void raise_encode_exception(PyObject **exceptionObject,
2246 const char *encoding,
2247 const Py_UNICODE *unicode, int size,
2248 int startpos, int endpos,
2249 const char *reason)
2250{
2251 make_encode_exception(exceptionObject,
2252 encoding, unicode, size, startpos, endpos, reason);
2253 if (*exceptionObject != NULL)
2254 PyCodec_StrictErrors(*exceptionObject);
2255}
2256
2257/* error handling callback helper:
2258 build arguments, call the callback and check the arguments,
2259 put the result into newpos and return the replacement string, which
2260 has to be freed by the caller */
2261static PyObject *unicode_encode_call_errorhandler(const char *errors,
2262 PyObject **errorHandler,
2263 const char *encoding, const char *reason,
2264 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2265 int startpos, int endpos,
2266 int *newpos)
2267{
2268 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2269
2270 PyObject *restuple;
2271 PyObject *resunicode;
2272
2273 if (*errorHandler == NULL) {
2274 *errorHandler = PyCodec_LookupError(errors);
2275 if (*errorHandler == NULL)
2276 return NULL;
2277 }
2278
2279 make_encode_exception(exceptionObject,
2280 encoding, unicode, size, startpos, endpos, reason);
2281 if (*exceptionObject == NULL)
2282 return NULL;
2283
2284 restuple = PyObject_CallFunctionObjArgs(
2285 *errorHandler, *exceptionObject, NULL);
2286 if (restuple == NULL)
2287 return NULL;
2288 if (!PyTuple_Check(restuple)) {
2289 PyErr_Format(PyExc_TypeError, &argparse[4]);
2290 Py_DECREF(restuple);
2291 return NULL;
2292 }
2293 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2294 &resunicode, newpos)) {
2295 Py_DECREF(restuple);
2296 return NULL;
2297 }
2298 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002299 *newpos = size+*newpos;
2300 if (*newpos<0 || *newpos>size) {
2301 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2302 Py_DECREF(restuple);
2303 return NULL;
2304 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002305 Py_INCREF(resunicode);
2306 Py_DECREF(restuple);
2307 return resunicode;
2308}
2309
2310static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2311 int size,
2312 const char *errors,
2313 int limit)
2314{
2315 /* output object */
2316 PyObject *res;
2317 /* pointers to the beginning and end+1 of input */
2318 const Py_UNICODE *startp = p;
2319 const Py_UNICODE *endp = p + size;
2320 /* pointer to the beginning of the unencodable characters */
2321 /* const Py_UNICODE *badp = NULL; */
2322 /* pointer into the output */
2323 char *str;
2324 /* current output position */
2325 int respos = 0;
2326 int ressize;
2327 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2328 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2329 PyObject *errorHandler = NULL;
2330 PyObject *exc = NULL;
2331 /* the following variable is used for caching string comparisons
2332 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2333 int known_errorHandler = -1;
2334
2335 /* allocate enough for a simple encoding without
2336 replacements, if we need more, we'll resize */
2337 res = PyString_FromStringAndSize(NULL, size);
2338 if (res == NULL)
2339 goto onError;
2340 if (size == 0)
2341 return res;
2342 str = PyString_AS_STRING(res);
2343 ressize = size;
2344
2345 while (p<endp) {
2346 Py_UNICODE c = *p;
2347
2348 /* can we encode this? */
2349 if (c<limit) {
2350 /* no overflow check, because we know that the space is enough */
2351 *str++ = (char)c;
2352 ++p;
2353 }
2354 else {
2355 int unicodepos = p-startp;
2356 int requiredsize;
2357 PyObject *repunicode;
2358 int repsize;
2359 int newpos;
2360 int respos;
2361 Py_UNICODE *uni2;
2362 /* startpos for collecting unencodable chars */
2363 const Py_UNICODE *collstart = p;
2364 const Py_UNICODE *collend = p;
2365 /* find all unecodable characters */
2366 while ((collend < endp) && ((*collend)>=limit))
2367 ++collend;
2368 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2369 if (known_errorHandler==-1) {
2370 if ((errors==NULL) || (!strcmp(errors, "strict")))
2371 known_errorHandler = 1;
2372 else if (!strcmp(errors, "replace"))
2373 known_errorHandler = 2;
2374 else if (!strcmp(errors, "ignore"))
2375 known_errorHandler = 3;
2376 else if (!strcmp(errors, "xmlcharrefreplace"))
2377 known_errorHandler = 4;
2378 else
2379 known_errorHandler = 0;
2380 }
2381 switch (known_errorHandler) {
2382 case 1: /* strict */
2383 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2384 goto onError;
2385 case 2: /* replace */
2386 while (collstart++<collend)
2387 *str++ = '?'; /* fall through */
2388 case 3: /* ignore */
2389 p = collend;
2390 break;
2391 case 4: /* xmlcharrefreplace */
2392 respos = str-PyString_AS_STRING(res);
2393 /* determine replacement size (temporarily (mis)uses p) */
2394 for (p = collstart, repsize = 0; p < collend; ++p) {
2395 if (*p<10)
2396 repsize += 2+1+1;
2397 else if (*p<100)
2398 repsize += 2+2+1;
2399 else if (*p<1000)
2400 repsize += 2+3+1;
2401 else if (*p<10000)
2402 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002403#ifndef Py_UNICODE_WIDE
2404 else
2405 repsize += 2+5+1;
2406#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002407 else if (*p<100000)
2408 repsize += 2+5+1;
2409 else if (*p<1000000)
2410 repsize += 2+6+1;
2411 else
2412 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002413#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002414 }
2415 requiredsize = respos+repsize+(endp-collend);
2416 if (requiredsize > ressize) {
2417 if (requiredsize<2*ressize)
2418 requiredsize = 2*ressize;
2419 if (_PyString_Resize(&res, requiredsize))
2420 goto onError;
2421 str = PyString_AS_STRING(res) + respos;
2422 ressize = requiredsize;
2423 }
2424 /* generate replacement (temporarily (mis)uses p) */
2425 for (p = collstart; p < collend; ++p) {
2426 str += sprintf(str, "&#%d;", (int)*p);
2427 }
2428 p = collend;
2429 break;
2430 default:
2431 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2432 encoding, reason, startp, size, &exc,
2433 collstart-startp, collend-startp, &newpos);
2434 if (repunicode == NULL)
2435 goto onError;
2436 /* need more space? (at least enough for what we
2437 have+the replacement+the rest of the string, so
2438 we won't have to check space for encodable characters) */
2439 respos = str-PyString_AS_STRING(res);
2440 repsize = PyUnicode_GET_SIZE(repunicode);
2441 requiredsize = respos+repsize+(endp-collend);
2442 if (requiredsize > ressize) {
2443 if (requiredsize<2*ressize)
2444 requiredsize = 2*ressize;
2445 if (_PyString_Resize(&res, requiredsize)) {
2446 Py_DECREF(repunicode);
2447 goto onError;
2448 }
2449 str = PyString_AS_STRING(res) + respos;
2450 ressize = requiredsize;
2451 }
2452 /* check if there is anything unencodable in the replacement
2453 and copy it to the output */
2454 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2455 c = *uni2;
2456 if (c >= limit) {
2457 raise_encode_exception(&exc, encoding, startp, size,
2458 unicodepos, unicodepos+1, reason);
2459 Py_DECREF(repunicode);
2460 goto onError;
2461 }
2462 *str = (char)c;
2463 }
2464 p = startp + newpos;
2465 Py_DECREF(repunicode);
2466 }
2467 }
2468 }
2469 /* Resize if we allocated to much */
2470 respos = str-PyString_AS_STRING(res);
2471 if (respos<ressize)
2472 /* If this falls res will be NULL */
2473 _PyString_Resize(&res, respos);
2474 Py_XDECREF(errorHandler);
2475 Py_XDECREF(exc);
2476 return res;
2477
2478 onError:
2479 Py_XDECREF(res);
2480 Py_XDECREF(errorHandler);
2481 Py_XDECREF(exc);
2482 return NULL;
2483}
2484
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2486 int size,
2487 const char *errors)
2488{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002489 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490}
2491
2492PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2493{
2494 if (!PyUnicode_Check(unicode)) {
2495 PyErr_BadArgument();
2496 return NULL;
2497 }
2498 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2499 PyUnicode_GET_SIZE(unicode),
2500 NULL);
2501}
2502
2503/* --- 7-bit ASCII Codec -------------------------------------------------- */
2504
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505PyObject *PyUnicode_DecodeASCII(const char *s,
2506 int size,
2507 const char *errors)
2508{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002509 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 PyUnicodeObject *v;
2511 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002512 int startinpos;
2513 int endinpos;
2514 int outpos;
2515 const char *e;
2516 PyObject *errorHandler = NULL;
2517 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002518
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002520 if (size == 1 && *(unsigned char*)s < 128) {
2521 Py_UNICODE r = *(unsigned char*)s;
2522 return PyUnicode_FromUnicode(&r, 1);
2523 }
Tim Petersced69f82003-09-16 20:30:58 +00002524
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 v = _PyUnicode_New(size);
2526 if (v == NULL)
2527 goto onError;
2528 if (size == 0)
2529 return (PyObject *)v;
2530 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002531 e = s + size;
2532 while (s < e) {
2533 register unsigned char c = (unsigned char)*s;
2534 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 ++s;
2537 }
2538 else {
2539 startinpos = s-starts;
2540 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002541 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002542 if (unicode_decode_call_errorhandler(
2543 errors, &errorHandler,
2544 "ascii", "ordinal not in range(128)",
2545 starts, size, &startinpos, &endinpos, &exc, &s,
2546 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002548 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002550 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002551 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002552 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002553 Py_XDECREF(errorHandler);
2554 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002556
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 onError:
2558 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002559 Py_XDECREF(errorHandler);
2560 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 return NULL;
2562}
2563
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2565 int size,
2566 const char *errors)
2567{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002568 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002569}
2570
2571PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2572{
2573 if (!PyUnicode_Check(unicode)) {
2574 PyErr_BadArgument();
2575 return NULL;
2576 }
2577 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2578 PyUnicode_GET_SIZE(unicode),
2579 NULL);
2580}
2581
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002582#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002583
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002584/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002585
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002586PyObject *PyUnicode_DecodeMBCS(const char *s,
2587 int size,
2588 const char *errors)
2589{
2590 PyUnicodeObject *v;
2591 Py_UNICODE *p;
2592
2593 /* First get the size of the result */
2594 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002595 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002596 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2597
2598 v = _PyUnicode_New(usize);
2599 if (v == NULL)
2600 return NULL;
2601 if (usize == 0)
2602 return (PyObject *)v;
2603 p = PyUnicode_AS_UNICODE(v);
2604 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2605 Py_DECREF(v);
2606 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2607 }
2608
2609 return (PyObject *)v;
2610}
2611
2612PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2613 int size,
2614 const char *errors)
2615{
2616 PyObject *repr;
2617 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002618 DWORD mbcssize;
2619
2620 /* If there are no characters, bail now! */
2621 if (size==0)
2622 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002623
2624 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002625 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002626 if (mbcssize==0)
2627 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2628
2629 repr = PyString_FromStringAndSize(NULL, mbcssize);
2630 if (repr == NULL)
2631 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002632 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002633 return repr;
2634
2635 /* Do the conversion */
2636 s = PyString_AS_STRING(repr);
2637 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2638 Py_DECREF(repr);
2639 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2640 }
2641 return repr;
2642}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002643
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002644PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2645{
2646 if (!PyUnicode_Check(unicode)) {
2647 PyErr_BadArgument();
2648 return NULL;
2649 }
2650 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2651 PyUnicode_GET_SIZE(unicode),
2652 NULL);
2653}
2654
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002655#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002656
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657/* --- Character Mapping Codec -------------------------------------------- */
2658
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659PyObject *PyUnicode_DecodeCharmap(const char *s,
2660 int size,
2661 PyObject *mapping,
2662 const char *errors)
2663{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002664 const char *starts = s;
2665 int startinpos;
2666 int endinpos;
2667 int outpos;
2668 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669 PyUnicodeObject *v;
2670 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002671 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002672 PyObject *errorHandler = NULL;
2673 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002674
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675 /* Default to Latin-1 */
2676 if (mapping == NULL)
2677 return PyUnicode_DecodeLatin1(s, size, errors);
2678
2679 v = _PyUnicode_New(size);
2680 if (v == NULL)
2681 goto onError;
2682 if (size == 0)
2683 return (PyObject *)v;
2684 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002685 e = s + size;
2686 while (s < e) {
2687 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688 PyObject *w, *x;
2689
2690 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2691 w = PyInt_FromLong((long)ch);
2692 if (w == NULL)
2693 goto onError;
2694 x = PyObject_GetItem(mapping, w);
2695 Py_DECREF(w);
2696 if (x == NULL) {
2697 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002698 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002700 x = Py_None;
2701 Py_INCREF(x);
2702 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002703 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704 }
2705
2706 /* Apply mapping */
2707 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002708 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 if (value < 0 || value > 65535) {
2710 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002711 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 Py_DECREF(x);
2713 goto onError;
2714 }
2715 *p++ = (Py_UNICODE)value;
2716 }
2717 else if (x == Py_None) {
2718 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002719 outpos = p-PyUnicode_AS_UNICODE(v);
2720 startinpos = s-starts;
2721 endinpos = startinpos+1;
2722 if (unicode_decode_call_errorhandler(
2723 errors, &errorHandler,
2724 "charmap", "character maps to <undefined>",
2725 starts, size, &startinpos, &endinpos, &exc, &s,
2726 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 Py_DECREF(x);
2728 goto onError;
2729 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002730 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 }
2732 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002733 int targetsize = PyUnicode_GET_SIZE(x);
2734
2735 if (targetsize == 1)
2736 /* 1-1 mapping */
2737 *p++ = *PyUnicode_AS_UNICODE(x);
2738
2739 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002741 if (targetsize > extrachars) {
2742 /* resize first */
2743 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2744 int needed = (targetsize - extrachars) + \
2745 (targetsize << 2);
2746 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002747 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002748 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002749 Py_DECREF(x);
2750 goto onError;
2751 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002752 p = PyUnicode_AS_UNICODE(v) + oldpos;
2753 }
2754 Py_UNICODE_COPY(p,
2755 PyUnicode_AS_UNICODE(x),
2756 targetsize);
2757 p += targetsize;
2758 extrachars -= targetsize;
2759 }
2760 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 }
2762 else {
2763 /* wrong return value */
2764 PyErr_SetString(PyExc_TypeError,
2765 "character mapping must return integer, None or unicode");
2766 Py_DECREF(x);
2767 goto onError;
2768 }
2769 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771 }
2772 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002773 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002775 Py_XDECREF(errorHandler);
2776 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002778
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002780 Py_XDECREF(errorHandler);
2781 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782 Py_XDECREF(v);
2783 return NULL;
2784}
2785
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002786/* Lookup the character ch in the mapping. If the character
2787 can't be found, Py_None is returned (or NULL, if another
2788 error occured). */
2789static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002791 PyObject *w = PyInt_FromLong((long)c);
2792 PyObject *x;
2793
2794 if (w == NULL)
2795 return NULL;
2796 x = PyObject_GetItem(mapping, w);
2797 Py_DECREF(w);
2798 if (x == NULL) {
2799 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2800 /* No mapping found means: mapping is undefined. */
2801 PyErr_Clear();
2802 x = Py_None;
2803 Py_INCREF(x);
2804 return x;
2805 } else
2806 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002808 else if (x == Py_None)
2809 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002810 else if (PyInt_Check(x)) {
2811 long value = PyInt_AS_LONG(x);
2812 if (value < 0 || value > 255) {
2813 PyErr_SetString(PyExc_TypeError,
2814 "character mapping must be in range(256)");
2815 Py_DECREF(x);
2816 return NULL;
2817 }
2818 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002820 else if (PyString_Check(x))
2821 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002823 /* wrong return value */
2824 PyErr_SetString(PyExc_TypeError,
2825 "character mapping must return integer, None or str");
2826 Py_DECREF(x);
2827 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 }
2829}
2830
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831/* lookup the character, put the result in the output string and adjust
2832 various state variables. Reallocate the output string if not enough
2833 space is available. Return a new reference to the object that
2834 was put in the output buffer, or Py_None, if the mapping was undefined
2835 (in which case no character was written) or NULL, if a
2836 reallocation error ocurred. The called must decref the result */
2837static
2838PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2839 PyObject **outobj, int *outpos)
2840{
2841 PyObject *rep = charmapencode_lookup(c, mapping);
2842
2843 if (rep==NULL)
2844 return NULL;
2845 else if (rep==Py_None)
2846 return rep;
2847 else {
2848 char *outstart = PyString_AS_STRING(*outobj);
2849 int outsize = PyString_GET_SIZE(*outobj);
2850 if (PyInt_Check(rep)) {
2851 int requiredsize = *outpos+1;
2852 if (outsize<requiredsize) {
2853 /* exponentially overallocate to minimize reallocations */
2854 if (requiredsize < 2*outsize)
2855 requiredsize = 2*outsize;
2856 if (_PyString_Resize(outobj, requiredsize)) {
2857 Py_DECREF(rep);
2858 return NULL;
2859 }
2860 outstart = PyString_AS_STRING(*outobj);
2861 }
2862 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2863 }
2864 else {
2865 const char *repchars = PyString_AS_STRING(rep);
2866 int repsize = PyString_GET_SIZE(rep);
2867 int requiredsize = *outpos+repsize;
2868 if (outsize<requiredsize) {
2869 /* exponentially overallocate to minimize reallocations */
2870 if (requiredsize < 2*outsize)
2871 requiredsize = 2*outsize;
2872 if (_PyString_Resize(outobj, requiredsize)) {
2873 Py_DECREF(rep);
2874 return NULL;
2875 }
2876 outstart = PyString_AS_STRING(*outobj);
2877 }
2878 memcpy(outstart + *outpos, repchars, repsize);
2879 *outpos += repsize;
2880 }
2881 }
2882 return rep;
2883}
2884
2885/* handle an error in PyUnicode_EncodeCharmap
2886 Return 0 on success, -1 on error */
2887static
2888int charmap_encoding_error(
2889 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2890 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002891 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002892 PyObject **res, int *respos)
2893{
2894 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2895 int repsize;
2896 int newpos;
2897 Py_UNICODE *uni2;
2898 /* startpos for collecting unencodable chars */
2899 int collstartpos = *inpos;
2900 int collendpos = *inpos+1;
2901 int collpos;
2902 char *encoding = "charmap";
2903 char *reason = "character maps to <undefined>";
2904
2905 PyObject *x;
2906 /* find all unencodable characters */
2907 while (collendpos < size) {
2908 x = charmapencode_lookup(p[collendpos], mapping);
2909 if (x==NULL)
2910 return -1;
2911 else if (x!=Py_None) {
2912 Py_DECREF(x);
2913 break;
2914 }
2915 Py_DECREF(x);
2916 ++collendpos;
2917 }
2918 /* cache callback name lookup
2919 * (if not done yet, i.e. it's the first error) */
2920 if (*known_errorHandler==-1) {
2921 if ((errors==NULL) || (!strcmp(errors, "strict")))
2922 *known_errorHandler = 1;
2923 else if (!strcmp(errors, "replace"))
2924 *known_errorHandler = 2;
2925 else if (!strcmp(errors, "ignore"))
2926 *known_errorHandler = 3;
2927 else if (!strcmp(errors, "xmlcharrefreplace"))
2928 *known_errorHandler = 4;
2929 else
2930 *known_errorHandler = 0;
2931 }
2932 switch (*known_errorHandler) {
2933 case 1: /* strict */
2934 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2935 return -1;
2936 case 2: /* replace */
2937 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2938 x = charmapencode_output('?', mapping, res, respos);
2939 if (x==NULL) {
2940 return -1;
2941 }
2942 else if (x==Py_None) {
2943 Py_DECREF(x);
2944 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2945 return -1;
2946 }
2947 Py_DECREF(x);
2948 }
2949 /* fall through */
2950 case 3: /* ignore */
2951 *inpos = collendpos;
2952 break;
2953 case 4: /* xmlcharrefreplace */
2954 /* generate replacement (temporarily (mis)uses p) */
2955 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2956 char buffer[2+29+1+1];
2957 char *cp;
2958 sprintf(buffer, "&#%d;", (int)p[collpos]);
2959 for (cp = buffer; *cp; ++cp) {
2960 x = charmapencode_output(*cp, mapping, res, respos);
2961 if (x==NULL)
2962 return -1;
2963 else if (x==Py_None) {
2964 Py_DECREF(x);
2965 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2966 return -1;
2967 }
2968 Py_DECREF(x);
2969 }
2970 }
2971 *inpos = collendpos;
2972 break;
2973 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002974 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002975 encoding, reason, p, size, exceptionObject,
2976 collstartpos, collendpos, &newpos);
2977 if (repunicode == NULL)
2978 return -1;
2979 /* generate replacement */
2980 repsize = PyUnicode_GET_SIZE(repunicode);
2981 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2982 x = charmapencode_output(*uni2, mapping, res, respos);
2983 if (x==NULL) {
2984 Py_DECREF(repunicode);
2985 return -1;
2986 }
2987 else if (x==Py_None) {
2988 Py_DECREF(repunicode);
2989 Py_DECREF(x);
2990 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2991 return -1;
2992 }
2993 Py_DECREF(x);
2994 }
2995 *inpos = newpos;
2996 Py_DECREF(repunicode);
2997 }
2998 return 0;
2999}
3000
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3002 int size,
3003 PyObject *mapping,
3004 const char *errors)
3005{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003006 /* output object */
3007 PyObject *res = NULL;
3008 /* current input position */
3009 int inpos = 0;
3010 /* current output position */
3011 int respos = 0;
3012 PyObject *errorHandler = NULL;
3013 PyObject *exc = NULL;
3014 /* the following variable is used for caching string comparisons
3015 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3016 * 3=ignore, 4=xmlcharrefreplace */
3017 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018
3019 /* Default to Latin-1 */
3020 if (mapping == NULL)
3021 return PyUnicode_EncodeLatin1(p, size, errors);
3022
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003023 /* allocate enough for a simple encoding without
3024 replacements, if we need more, we'll resize */
3025 res = PyString_FromStringAndSize(NULL, size);
3026 if (res == NULL)
3027 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003028 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003029 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 while (inpos<size) {
3032 /* try to encode it */
3033 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3034 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003036 if (x==Py_None) { /* unencodable character */
3037 if (charmap_encoding_error(p, size, &inpos, mapping,
3038 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003039 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003040 &res, &respos)) {
3041 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003042 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003045 else
3046 /* done with this character => adjust input position */
3047 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 Py_DECREF(x);
3049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 /* Resize if we allocated to much */
3052 if (respos<PyString_GET_SIZE(res)) {
3053 if (_PyString_Resize(&res, respos))
3054 goto onError;
3055 }
3056 Py_XDECREF(exc);
3057 Py_XDECREF(errorHandler);
3058 return res;
3059
3060 onError:
3061 Py_XDECREF(res);
3062 Py_XDECREF(exc);
3063 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 return NULL;
3065}
3066
3067PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3068 PyObject *mapping)
3069{
3070 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3071 PyErr_BadArgument();
3072 return NULL;
3073 }
3074 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3075 PyUnicode_GET_SIZE(unicode),
3076 mapping,
3077 NULL);
3078}
3079
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003080/* create or adjust a UnicodeTranslateError */
3081static void make_translate_exception(PyObject **exceptionObject,
3082 const Py_UNICODE *unicode, int size,
3083 int startpos, int endpos,
3084 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003086 if (*exceptionObject == NULL) {
3087 *exceptionObject = PyUnicodeTranslateError_Create(
3088 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 }
3090 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3092 goto onError;
3093 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3094 goto onError;
3095 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3096 goto onError;
3097 return;
3098 onError:
3099 Py_DECREF(*exceptionObject);
3100 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 }
3102}
3103
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003104/* raises a UnicodeTranslateError */
3105static void raise_translate_exception(PyObject **exceptionObject,
3106 const Py_UNICODE *unicode, int size,
3107 int startpos, int endpos,
3108 const char *reason)
3109{
3110 make_translate_exception(exceptionObject,
3111 unicode, size, startpos, endpos, reason);
3112 if (*exceptionObject != NULL)
3113 PyCodec_StrictErrors(*exceptionObject);
3114}
3115
3116/* error handling callback helper:
3117 build arguments, call the callback and check the arguments,
3118 put the result into newpos and return the replacement string, which
3119 has to be freed by the caller */
3120static PyObject *unicode_translate_call_errorhandler(const char *errors,
3121 PyObject **errorHandler,
3122 const char *reason,
3123 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3124 int startpos, int endpos,
3125 int *newpos)
3126{
3127 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3128
3129 PyObject *restuple;
3130 PyObject *resunicode;
3131
3132 if (*errorHandler == NULL) {
3133 *errorHandler = PyCodec_LookupError(errors);
3134 if (*errorHandler == NULL)
3135 return NULL;
3136 }
3137
3138 make_translate_exception(exceptionObject,
3139 unicode, size, startpos, endpos, reason);
3140 if (*exceptionObject == NULL)
3141 return NULL;
3142
3143 restuple = PyObject_CallFunctionObjArgs(
3144 *errorHandler, *exceptionObject, NULL);
3145 if (restuple == NULL)
3146 return NULL;
3147 if (!PyTuple_Check(restuple)) {
3148 PyErr_Format(PyExc_TypeError, &argparse[4]);
3149 Py_DECREF(restuple);
3150 return NULL;
3151 }
3152 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3153 &resunicode, newpos)) {
3154 Py_DECREF(restuple);
3155 return NULL;
3156 }
3157 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003158 *newpos = size+*newpos;
3159 if (*newpos<0 || *newpos>size) {
3160 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3161 Py_DECREF(restuple);
3162 return NULL;
3163 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164 Py_INCREF(resunicode);
3165 Py_DECREF(restuple);
3166 return resunicode;
3167}
3168
3169/* Lookup the character ch in the mapping and put the result in result,
3170 which must be decrefed by the caller.
3171 Return 0 on success, -1 on error */
3172static
3173int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3174{
3175 PyObject *w = PyInt_FromLong((long)c);
3176 PyObject *x;
3177
3178 if (w == NULL)
3179 return -1;
3180 x = PyObject_GetItem(mapping, w);
3181 Py_DECREF(w);
3182 if (x == NULL) {
3183 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3184 /* No mapping found means: use 1:1 mapping. */
3185 PyErr_Clear();
3186 *result = NULL;
3187 return 0;
3188 } else
3189 return -1;
3190 }
3191 else if (x == Py_None) {
3192 *result = x;
3193 return 0;
3194 }
3195 else if (PyInt_Check(x)) {
3196 long value = PyInt_AS_LONG(x);
3197 long max = PyUnicode_GetMax();
3198 if (value < 0 || value > max) {
3199 PyErr_Format(PyExc_TypeError,
3200 "character mapping must be in range(0x%lx)", max+1);
3201 Py_DECREF(x);
3202 return -1;
3203 }
3204 *result = x;
3205 return 0;
3206 }
3207 else if (PyUnicode_Check(x)) {
3208 *result = x;
3209 return 0;
3210 }
3211 else {
3212 /* wrong return value */
3213 PyErr_SetString(PyExc_TypeError,
3214 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003215 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 return -1;
3217 }
3218}
3219/* ensure that *outobj is at least requiredsize characters long,
3220if not reallocate and adjust various state variables.
3221Return 0 on success, -1 on error */
3222static
Walter Dörwald4894c302003-10-24 14:25:28 +00003223int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003224 int requiredsize)
3225{
Walter Dörwald4894c302003-10-24 14:25:28 +00003226 int oldsize = PyUnicode_GET_SIZE(*outobj);
3227 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003228 /* remember old output position */
3229 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3230 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003231 if (requiredsize < 2 * oldsize)
3232 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003233 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 return -1;
3235 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 }
3237 return 0;
3238}
3239/* lookup the character, put the result in the output string and adjust
3240 various state variables. Return a new reference to the object that
3241 was put in the output buffer in *result, or Py_None, if the mapping was
3242 undefined (in which case no character was written).
3243 The called must decref result.
3244 Return 0 on success, -1 on error. */
3245static
Walter Dörwald4894c302003-10-24 14:25:28 +00003246int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3247 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3248 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003249{
Walter Dörwald4894c302003-10-24 14:25:28 +00003250 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003251 return -1;
3252 if (*res==NULL) {
3253 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003254 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003255 }
3256 else if (*res==Py_None)
3257 ;
3258 else if (PyInt_Check(*res)) {
3259 /* no overflow check, because we know that the space is enough */
3260 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3261 }
3262 else if (PyUnicode_Check(*res)) {
3263 int repsize = PyUnicode_GET_SIZE(*res);
3264 if (repsize==1) {
3265 /* no overflow check, because we know that the space is enough */
3266 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3267 }
3268 else if (repsize!=0) {
3269 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003270 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003271 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003272 repsize - 1;
3273 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003274 return -1;
3275 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3276 *outp += repsize;
3277 }
3278 }
3279 else
3280 return -1;
3281 return 0;
3282}
3283
3284PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 int size,
3286 PyObject *mapping,
3287 const char *errors)
3288{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003289 /* output object */
3290 PyObject *res = NULL;
3291 /* pointers to the beginning and end+1 of input */
3292 const Py_UNICODE *startp = p;
3293 const Py_UNICODE *endp = p + size;
3294 /* pointer into the output */
3295 Py_UNICODE *str;
3296 /* current output position */
3297 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 char *reason = "character maps to <undefined>";
3299 PyObject *errorHandler = NULL;
3300 PyObject *exc = NULL;
3301 /* the following variable is used for caching string comparisons
3302 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3303 * 3=ignore, 4=xmlcharrefreplace */
3304 int known_errorHandler = -1;
3305
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 if (mapping == NULL) {
3307 PyErr_BadArgument();
3308 return NULL;
3309 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003310
3311 /* allocate enough for a simple 1:1 translation without
3312 replacements, if we need more, we'll resize */
3313 res = PyUnicode_FromUnicode(NULL, size);
3314 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003315 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317 return res;
3318 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003320 while (p<endp) {
3321 /* try to encode it */
3322 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003323 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 goto onError;
3326 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003327 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003328 if (x!=Py_None) /* it worked => adjust input pointer */
3329 ++p;
3330 else { /* untranslatable character */
3331 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3332 int repsize;
3333 int newpos;
3334 Py_UNICODE *uni2;
3335 /* startpos for collecting untranslatable chars */
3336 const Py_UNICODE *collstart = p;
3337 const Py_UNICODE *collend = p+1;
3338 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003340 /* find all untranslatable characters */
3341 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003342 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 goto onError;
3344 Py_XDECREF(x);
3345 if (x!=Py_None)
3346 break;
3347 ++collend;
3348 }
3349 /* cache callback name lookup
3350 * (if not done yet, i.e. it's the first error) */
3351 if (known_errorHandler==-1) {
3352 if ((errors==NULL) || (!strcmp(errors, "strict")))
3353 known_errorHandler = 1;
3354 else if (!strcmp(errors, "replace"))
3355 known_errorHandler = 2;
3356 else if (!strcmp(errors, "ignore"))
3357 known_errorHandler = 3;
3358 else if (!strcmp(errors, "xmlcharrefreplace"))
3359 known_errorHandler = 4;
3360 else
3361 known_errorHandler = 0;
3362 }
3363 switch (known_errorHandler) {
3364 case 1: /* strict */
3365 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3366 goto onError;
3367 case 2: /* replace */
3368 /* No need to check for space, this is a 1:1 replacement */
3369 for (coll = collstart; coll<collend; ++coll)
3370 *str++ = '?';
3371 /* fall through */
3372 case 3: /* ignore */
3373 p = collend;
3374 break;
3375 case 4: /* xmlcharrefreplace */
3376 /* generate replacement (temporarily (mis)uses p) */
3377 for (p = collstart; p < collend; ++p) {
3378 char buffer[2+29+1+1];
3379 char *cp;
3380 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003381 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003382 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3383 goto onError;
3384 for (cp = buffer; *cp; ++cp)
3385 *str++ = *cp;
3386 }
3387 p = collend;
3388 break;
3389 default:
3390 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3391 reason, startp, size, &exc,
3392 collstart-startp, collend-startp, &newpos);
3393 if (repunicode == NULL)
3394 goto onError;
3395 /* generate replacement */
3396 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003397 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3399 Py_DECREF(repunicode);
3400 goto onError;
3401 }
3402 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3403 *str++ = *uni2;
3404 p = startp + newpos;
3405 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406 }
3407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003409 /* Resize if we allocated to much */
3410 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003411 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003412 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003413 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003414 }
3415 Py_XDECREF(exc);
3416 Py_XDECREF(errorHandler);
3417 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 onError:
3420 Py_XDECREF(res);
3421 Py_XDECREF(exc);
3422 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423 return NULL;
3424}
3425
3426PyObject *PyUnicode_Translate(PyObject *str,
3427 PyObject *mapping,
3428 const char *errors)
3429{
3430 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003431
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432 str = PyUnicode_FromObject(str);
3433 if (str == NULL)
3434 goto onError;
3435 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3436 PyUnicode_GET_SIZE(str),
3437 mapping,
3438 errors);
3439 Py_DECREF(str);
3440 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003441
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442 onError:
3443 Py_XDECREF(str);
3444 return NULL;
3445}
Tim Petersced69f82003-09-16 20:30:58 +00003446
Guido van Rossum9e896b32000-04-05 20:11:21 +00003447/* --- Decimal Encoder ---------------------------------------------------- */
3448
3449int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3450 int length,
3451 char *output,
3452 const char *errors)
3453{
3454 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003455 PyObject *errorHandler = NULL;
3456 PyObject *exc = NULL;
3457 const char *encoding = "decimal";
3458 const char *reason = "invalid decimal Unicode string";
3459 /* the following variable is used for caching string comparisons
3460 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3461 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003462
3463 if (output == NULL) {
3464 PyErr_BadArgument();
3465 return -1;
3466 }
3467
3468 p = s;
3469 end = s + length;
3470 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003472 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473 PyObject *repunicode;
3474 int repsize;
3475 int newpos;
3476 Py_UNICODE *uni2;
3477 Py_UNICODE *collstart;
3478 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003479
Guido van Rossum9e896b32000-04-05 20:11:21 +00003480 if (Py_UNICODE_ISSPACE(ch)) {
3481 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003483 continue;
3484 }
3485 decimal = Py_UNICODE_TODECIMAL(ch);
3486 if (decimal >= 0) {
3487 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003489 continue;
3490 }
Guido van Rossumba477042000-04-06 18:18:10 +00003491 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003492 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003494 continue;
3495 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496 /* All other characters are considered unencodable */
3497 collstart = p;
3498 collend = p+1;
3499 while (collend < end) {
3500 if ((0 < *collend && *collend < 256) ||
3501 !Py_UNICODE_ISSPACE(*collend) ||
3502 Py_UNICODE_TODECIMAL(*collend))
3503 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003504 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 /* cache callback name lookup
3506 * (if not done yet, i.e. it's the first error) */
3507 if (known_errorHandler==-1) {
3508 if ((errors==NULL) || (!strcmp(errors, "strict")))
3509 known_errorHandler = 1;
3510 else if (!strcmp(errors, "replace"))
3511 known_errorHandler = 2;
3512 else if (!strcmp(errors, "ignore"))
3513 known_errorHandler = 3;
3514 else if (!strcmp(errors, "xmlcharrefreplace"))
3515 known_errorHandler = 4;
3516 else
3517 known_errorHandler = 0;
3518 }
3519 switch (known_errorHandler) {
3520 case 1: /* strict */
3521 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3522 goto onError;
3523 case 2: /* replace */
3524 for (p = collstart; p < collend; ++p)
3525 *output++ = '?';
3526 /* fall through */
3527 case 3: /* ignore */
3528 p = collend;
3529 break;
3530 case 4: /* xmlcharrefreplace */
3531 /* generate replacement (temporarily (mis)uses p) */
3532 for (p = collstart; p < collend; ++p)
3533 output += sprintf(output, "&#%d;", (int)*p);
3534 p = collend;
3535 break;
3536 default:
3537 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3538 encoding, reason, s, length, &exc,
3539 collstart-s, collend-s, &newpos);
3540 if (repunicode == NULL)
3541 goto onError;
3542 /* generate replacement */
3543 repsize = PyUnicode_GET_SIZE(repunicode);
3544 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3545 Py_UNICODE ch = *uni2;
3546 if (Py_UNICODE_ISSPACE(ch))
3547 *output++ = ' ';
3548 else {
3549 decimal = Py_UNICODE_TODECIMAL(ch);
3550 if (decimal >= 0)
3551 *output++ = '0' + decimal;
3552 else if (0 < ch && ch < 256)
3553 *output++ = (char)ch;
3554 else {
3555 Py_DECREF(repunicode);
3556 raise_encode_exception(&exc, encoding,
3557 s, length, collstart-s, collend-s, reason);
3558 goto onError;
3559 }
3560 }
3561 }
3562 p = s + newpos;
3563 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003564 }
3565 }
3566 /* 0-terminate the output string */
3567 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 Py_XDECREF(exc);
3569 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003570 return 0;
3571
3572 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 Py_XDECREF(exc);
3574 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003575 return -1;
3576}
3577
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578/* --- Helpers ------------------------------------------------------------ */
3579
Tim Petersced69f82003-09-16 20:30:58 +00003580static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581int count(PyUnicodeObject *self,
3582 int start,
3583 int end,
3584 PyUnicodeObject *substring)
3585{
3586 int count = 0;
3587
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003588 if (start < 0)
3589 start += self->length;
3590 if (start < 0)
3591 start = 0;
3592 if (end > self->length)
3593 end = self->length;
3594 if (end < 0)
3595 end += self->length;
3596 if (end < 0)
3597 end = 0;
3598
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003599 if (substring->length == 0)
3600 return (end - start + 1);
3601
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602 end -= substring->length;
3603
3604 while (start <= end)
3605 if (Py_UNICODE_MATCH(self, start, substring)) {
3606 count++;
3607 start += substring->length;
3608 } else
3609 start++;
3610
3611 return count;
3612}
3613
3614int PyUnicode_Count(PyObject *str,
3615 PyObject *substr,
3616 int start,
3617 int end)
3618{
3619 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003620
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621 str = PyUnicode_FromObject(str);
3622 if (str == NULL)
3623 return -1;
3624 substr = PyUnicode_FromObject(substr);
3625 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003626 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003627 return -1;
3628 }
Tim Petersced69f82003-09-16 20:30:58 +00003629
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 result = count((PyUnicodeObject *)str,
3631 start, end,
3632 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003633
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 Py_DECREF(str);
3635 Py_DECREF(substr);
3636 return result;
3637}
3638
Tim Petersced69f82003-09-16 20:30:58 +00003639static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640int findstring(PyUnicodeObject *self,
3641 PyUnicodeObject *substring,
3642 int start,
3643 int end,
3644 int direction)
3645{
3646 if (start < 0)
3647 start += self->length;
3648 if (start < 0)
3649 start = 0;
3650
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 if (end > self->length)
3652 end = self->length;
3653 if (end < 0)
3654 end += self->length;
3655 if (end < 0)
3656 end = 0;
3657
Guido van Rossum76afbd92002-08-20 17:29:29 +00003658 if (substring->length == 0)
3659 return (direction > 0) ? start : end;
3660
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 end -= substring->length;
3662
3663 if (direction < 0) {
3664 for (; end >= start; end--)
3665 if (Py_UNICODE_MATCH(self, end, substring))
3666 return end;
3667 } else {
3668 for (; start <= end; start++)
3669 if (Py_UNICODE_MATCH(self, start, substring))
3670 return start;
3671 }
3672
3673 return -1;
3674}
3675
3676int PyUnicode_Find(PyObject *str,
3677 PyObject *substr,
3678 int start,
3679 int end,
3680 int direction)
3681{
3682 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003683
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 str = PyUnicode_FromObject(str);
3685 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003686 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687 substr = PyUnicode_FromObject(substr);
3688 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003689 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003690 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 }
Tim Petersced69f82003-09-16 20:30:58 +00003692
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 result = findstring((PyUnicodeObject *)str,
3694 (PyUnicodeObject *)substr,
3695 start, end, direction);
3696 Py_DECREF(str);
3697 Py_DECREF(substr);
3698 return result;
3699}
3700
Tim Petersced69f82003-09-16 20:30:58 +00003701static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702int tailmatch(PyUnicodeObject *self,
3703 PyUnicodeObject *substring,
3704 int start,
3705 int end,
3706 int direction)
3707{
3708 if (start < 0)
3709 start += self->length;
3710 if (start < 0)
3711 start = 0;
3712
3713 if (substring->length == 0)
3714 return 1;
3715
3716 if (end > self->length)
3717 end = self->length;
3718 if (end < 0)
3719 end += self->length;
3720 if (end < 0)
3721 end = 0;
3722
3723 end -= substring->length;
3724 if (end < start)
3725 return 0;
3726
3727 if (direction > 0) {
3728 if (Py_UNICODE_MATCH(self, end, substring))
3729 return 1;
3730 } else {
3731 if (Py_UNICODE_MATCH(self, start, substring))
3732 return 1;
3733 }
3734
3735 return 0;
3736}
3737
3738int PyUnicode_Tailmatch(PyObject *str,
3739 PyObject *substr,
3740 int start,
3741 int end,
3742 int direction)
3743{
3744 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003745
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746 str = PyUnicode_FromObject(str);
3747 if (str == NULL)
3748 return -1;
3749 substr = PyUnicode_FromObject(substr);
3750 if (substr == NULL) {
3751 Py_DECREF(substr);
3752 return -1;
3753 }
Tim Petersced69f82003-09-16 20:30:58 +00003754
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 result = tailmatch((PyUnicodeObject *)str,
3756 (PyUnicodeObject *)substr,
3757 start, end, direction);
3758 Py_DECREF(str);
3759 Py_DECREF(substr);
3760 return result;
3761}
3762
Tim Petersced69f82003-09-16 20:30:58 +00003763static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764const Py_UNICODE *findchar(const Py_UNICODE *s,
3765 int size,
3766 Py_UNICODE ch)
3767{
3768 /* like wcschr, but doesn't stop at NULL characters */
3769
3770 while (size-- > 0) {
3771 if (*s == ch)
3772 return s;
3773 s++;
3774 }
3775
3776 return NULL;
3777}
3778
3779/* Apply fixfct filter to the Unicode object self and return a
3780 reference to the modified object */
3781
Tim Petersced69f82003-09-16 20:30:58 +00003782static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783PyObject *fixup(PyUnicodeObject *self,
3784 int (*fixfct)(PyUnicodeObject *s))
3785{
3786
3787 PyUnicodeObject *u;
3788
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003789 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 if (u == NULL)
3791 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003792
3793 Py_UNICODE_COPY(u->str, self->str, self->length);
3794
Tim Peters7a29bd52001-09-12 03:03:31 +00003795 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 /* fixfct should return TRUE if it modified the buffer. If
3797 FALSE, return a reference to the original buffer instead
3798 (to save space, not time) */
3799 Py_INCREF(self);
3800 Py_DECREF(u);
3801 return (PyObject*) self;
3802 }
3803 return (PyObject*) u;
3804}
3805
Tim Petersced69f82003-09-16 20:30:58 +00003806static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003807int fixupper(PyUnicodeObject *self)
3808{
3809 int len = self->length;
3810 Py_UNICODE *s = self->str;
3811 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003812
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 while (len-- > 0) {
3814 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003815
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 ch = Py_UNICODE_TOUPPER(*s);
3817 if (ch != *s) {
3818 status = 1;
3819 *s = ch;
3820 }
3821 s++;
3822 }
3823
3824 return status;
3825}
3826
Tim Petersced69f82003-09-16 20:30:58 +00003827static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828int fixlower(PyUnicodeObject *self)
3829{
3830 int len = self->length;
3831 Py_UNICODE *s = self->str;
3832 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003833
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 while (len-- > 0) {
3835 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003836
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 ch = Py_UNICODE_TOLOWER(*s);
3838 if (ch != *s) {
3839 status = 1;
3840 *s = ch;
3841 }
3842 s++;
3843 }
3844
3845 return status;
3846}
3847
Tim Petersced69f82003-09-16 20:30:58 +00003848static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849int fixswapcase(PyUnicodeObject *self)
3850{
3851 int len = self->length;
3852 Py_UNICODE *s = self->str;
3853 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003854
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 while (len-- > 0) {
3856 if (Py_UNICODE_ISUPPER(*s)) {
3857 *s = Py_UNICODE_TOLOWER(*s);
3858 status = 1;
3859 } else if (Py_UNICODE_ISLOWER(*s)) {
3860 *s = Py_UNICODE_TOUPPER(*s);
3861 status = 1;
3862 }
3863 s++;
3864 }
3865
3866 return status;
3867}
3868
Tim Petersced69f82003-09-16 20:30:58 +00003869static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870int fixcapitalize(PyUnicodeObject *self)
3871{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003872 int len = self->length;
3873 Py_UNICODE *s = self->str;
3874 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003875
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003876 if (len == 0)
3877 return 0;
3878 if (Py_UNICODE_ISLOWER(*s)) {
3879 *s = Py_UNICODE_TOUPPER(*s);
3880 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003882 s++;
3883 while (--len > 0) {
3884 if (Py_UNICODE_ISUPPER(*s)) {
3885 *s = Py_UNICODE_TOLOWER(*s);
3886 status = 1;
3887 }
3888 s++;
3889 }
3890 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891}
3892
3893static
3894int fixtitle(PyUnicodeObject *self)
3895{
3896 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3897 register Py_UNICODE *e;
3898 int previous_is_cased;
3899
3900 /* Shortcut for single character strings */
3901 if (PyUnicode_GET_SIZE(self) == 1) {
3902 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3903 if (*p != ch) {
3904 *p = ch;
3905 return 1;
3906 }
3907 else
3908 return 0;
3909 }
Tim Petersced69f82003-09-16 20:30:58 +00003910
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911 e = p + PyUnicode_GET_SIZE(self);
3912 previous_is_cased = 0;
3913 for (; p < e; p++) {
3914 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003915
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916 if (previous_is_cased)
3917 *p = Py_UNICODE_TOLOWER(ch);
3918 else
3919 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003920
3921 if (Py_UNICODE_ISLOWER(ch) ||
3922 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 Py_UNICODE_ISTITLE(ch))
3924 previous_is_cased = 1;
3925 else
3926 previous_is_cased = 0;
3927 }
3928 return 1;
3929}
3930
3931PyObject *PyUnicode_Join(PyObject *separator,
3932 PyObject *seq)
3933{
3934 Py_UNICODE *sep;
3935 int seplen;
3936 PyUnicodeObject *res = NULL;
3937 int reslen = 0;
3938 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003939 int sz = 100;
3940 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003941 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942
Tim Peters2cfe3682001-05-05 05:36:48 +00003943 it = PyObject_GetIter(seq);
3944 if (it == NULL)
3945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003946
3947 if (separator == NULL) {
3948 Py_UNICODE blank = ' ';
3949 sep = &blank;
3950 seplen = 1;
3951 }
3952 else {
3953 separator = PyUnicode_FromObject(separator);
3954 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003955 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 sep = PyUnicode_AS_UNICODE(separator);
3957 seplen = PyUnicode_GET_SIZE(separator);
3958 }
Tim Petersced69f82003-09-16 20:30:58 +00003959
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 res = _PyUnicode_New(sz);
3961 if (res == NULL)
3962 goto onError;
3963 p = PyUnicode_AS_UNICODE(res);
3964 reslen = 0;
3965
Tim Peters2cfe3682001-05-05 05:36:48 +00003966 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003968 PyObject *item = PyIter_Next(it);
3969 if (item == NULL) {
3970 if (PyErr_Occurred())
3971 goto onError;
3972 break;
3973 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974 if (!PyUnicode_Check(item)) {
3975 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003976 if (!PyString_Check(item)) {
3977 PyErr_Format(PyExc_TypeError,
3978 "sequence item %i: expected string or Unicode,"
3979 " %.80s found",
3980 i, item->ob_type->tp_name);
3981 Py_DECREF(item);
3982 goto onError;
3983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 v = PyUnicode_FromObject(item);
3985 Py_DECREF(item);
3986 item = v;
3987 if (item == NULL)
3988 goto onError;
3989 }
3990 itemlen = PyUnicode_GET_SIZE(item);
3991 while (reslen + itemlen + seplen >= sz) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003992 if (_PyUnicode_Resize(&res, sz*2) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003993 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003995 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 sz *= 2;
3997 p = PyUnicode_AS_UNICODE(res) + reslen;
3998 }
3999 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004000 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001 p += seplen;
4002 reslen += seplen;
4003 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004004 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 p += itemlen;
4006 reslen += itemlen;
4007 Py_DECREF(item);
4008 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004009 if (_PyUnicode_Resize(&res, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 goto onError;
4011
4012 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004013 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 return (PyObject *)res;
4015
4016 onError:
4017 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004018 Py_XDECREF(res);
4019 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020 return NULL;
4021}
4022
Tim Petersced69f82003-09-16 20:30:58 +00004023static
4024PyUnicodeObject *pad(PyUnicodeObject *self,
4025 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 int right,
4027 Py_UNICODE fill)
4028{
4029 PyUnicodeObject *u;
4030
4031 if (left < 0)
4032 left = 0;
4033 if (right < 0)
4034 right = 0;
4035
Tim Peters7a29bd52001-09-12 03:03:31 +00004036 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 Py_INCREF(self);
4038 return self;
4039 }
4040
4041 u = _PyUnicode_New(left + self->length + right);
4042 if (u) {
4043 if (left)
4044 Py_UNICODE_FILL(u->str, fill, left);
4045 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4046 if (right)
4047 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4048 }
4049
4050 return u;
4051}
4052
/* Append the slice data[left:right] to the list being built.

   The expansion relies on the surrounding function providing a
   PyObject *str scratch variable, the target PyObject *list and an
   onError label, which is jumped to if creating or appending the slice
   fails.  It expands to several statements, so it must not be used as
   the unbraced body of an if/else or loop. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
4063
/* Insert the slice data[left:right] at the front (index 0) of the list
   being built.

   The expansion relies on the surrounding function providing a
   PyObject *str scratch variable, the target PyObject *list and an
   onError label, which is jumped to if creating or inserting the slice
   fails.  It expands to several statements, so it must not be used as
   the unbraced body of an if/else or loop. */
#define SPLIT_INSERT(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Insert(list, 0, str) != 0) {                             \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
4074
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075static
4076PyObject *split_whitespace(PyUnicodeObject *self,
4077 PyObject *list,
4078 int maxcount)
4079{
4080 register int i;
4081 register int j;
4082 int len = self->length;
4083 PyObject *str;
4084
4085 for (i = j = 0; i < len; ) {
4086 /* find a token */
4087 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4088 i++;
4089 j = i;
4090 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4091 i++;
4092 if (j < i) {
4093 if (maxcount-- <= 0)
4094 break;
4095 SPLIT_APPEND(self->str, j, i);
4096 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4097 i++;
4098 j = i;
4099 }
4100 }
4101 if (j < len) {
4102 SPLIT_APPEND(self->str, j, len);
4103 }
4104 return list;
4105
4106 onError:
4107 Py_DECREF(list);
4108 return NULL;
4109}
4110
4111PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004112 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113{
4114 register int i;
4115 register int j;
4116 int len;
4117 PyObject *list;
4118 PyObject *str;
4119 Py_UNICODE *data;
4120
4121 string = PyUnicode_FromObject(string);
4122 if (string == NULL)
4123 return NULL;
4124 data = PyUnicode_AS_UNICODE(string);
4125 len = PyUnicode_GET_SIZE(string);
4126
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 list = PyList_New(0);
4128 if (!list)
4129 goto onError;
4130
4131 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004132 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004133
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 /* Find a line and append it */
4135 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4136 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137
4138 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004139 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 if (i < len) {
4141 if (data[i] == '\r' && i + 1 < len &&
4142 data[i+1] == '\n')
4143 i += 2;
4144 else
4145 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004146 if (keepends)
4147 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 }
Guido van Rossum86662912000-04-11 15:38:46 +00004149 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 j = i;
4151 }
4152 if (j < len) {
4153 SPLIT_APPEND(data, j, len);
4154 }
4155
4156 Py_DECREF(string);
4157 return list;
4158
4159 onError:
4160 Py_DECREF(list);
4161 Py_DECREF(string);
4162 return NULL;
4163}
4164
Tim Petersced69f82003-09-16 20:30:58 +00004165static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166PyObject *split_char(PyUnicodeObject *self,
4167 PyObject *list,
4168 Py_UNICODE ch,
4169 int maxcount)
4170{
4171 register int i;
4172 register int j;
4173 int len = self->length;
4174 PyObject *str;
4175
4176 for (i = j = 0; i < len; ) {
4177 if (self->str[i] == ch) {
4178 if (maxcount-- <= 0)
4179 break;
4180 SPLIT_APPEND(self->str, j, i);
4181 i = j = i + 1;
4182 } else
4183 i++;
4184 }
4185 if (j <= len) {
4186 SPLIT_APPEND(self->str, j, len);
4187 }
4188 return list;
4189
4190 onError:
4191 Py_DECREF(list);
4192 return NULL;
4193}
4194
Tim Petersced69f82003-09-16 20:30:58 +00004195static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196PyObject *split_substring(PyUnicodeObject *self,
4197 PyObject *list,
4198 PyUnicodeObject *substring,
4199 int maxcount)
4200{
4201 register int i;
4202 register int j;
4203 int len = self->length;
4204 int sublen = substring->length;
4205 PyObject *str;
4206
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004207 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208 if (Py_UNICODE_MATCH(self, i, substring)) {
4209 if (maxcount-- <= 0)
4210 break;
4211 SPLIT_APPEND(self->str, j, i);
4212 i = j = i + sublen;
4213 } else
4214 i++;
4215 }
4216 if (j <= len) {
4217 SPLIT_APPEND(self->str, j, len);
4218 }
4219 return list;
4220
4221 onError:
4222 Py_DECREF(list);
4223 return NULL;
4224}
4225
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004226static
4227PyObject *rsplit_whitespace(PyUnicodeObject *self,
4228 PyObject *list,
4229 int maxcount)
4230{
4231 register int i;
4232 register int j;
4233 int len = self->length;
4234 PyObject *str;
4235
4236 for (i = j = len - 1; i >= 0; ) {
4237 /* find a token */
4238 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4239 i--;
4240 j = i;
4241 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4242 i--;
4243 if (j > i) {
4244 if (maxcount-- <= 0)
4245 break;
4246 SPLIT_INSERT(self->str, i + 1, j + 1);
4247 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4248 i--;
4249 j = i;
4250 }
4251 }
4252 if (j >= 0) {
4253 SPLIT_INSERT(self->str, 0, j + 1);
4254 }
4255 return list;
4256
4257 onError:
4258 Py_DECREF(list);
4259 return NULL;
4260}
4261
4262static
4263PyObject *rsplit_char(PyUnicodeObject *self,
4264 PyObject *list,
4265 Py_UNICODE ch,
4266 int maxcount)
4267{
4268 register int i;
4269 register int j;
4270 int len = self->length;
4271 PyObject *str;
4272
4273 for (i = j = len - 1; i >= 0; ) {
4274 if (self->str[i] == ch) {
4275 if (maxcount-- <= 0)
4276 break;
4277 SPLIT_INSERT(self->str, i + 1, j + 1);
4278 j = i = i - 1;
4279 } else
4280 i--;
4281 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004282 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004283 SPLIT_INSERT(self->str, 0, j + 1);
4284 }
4285 return list;
4286
4287 onError:
4288 Py_DECREF(list);
4289 return NULL;
4290}
4291
4292static
4293PyObject *rsplit_substring(PyUnicodeObject *self,
4294 PyObject *list,
4295 PyUnicodeObject *substring,
4296 int maxcount)
4297{
4298 register int i;
4299 register int j;
4300 int len = self->length;
4301 int sublen = substring->length;
4302 PyObject *str;
4303
4304 for (i = len - sublen, j = len; i >= 0; ) {
4305 if (Py_UNICODE_MATCH(self, i, substring)) {
4306 if (maxcount-- <= 0)
4307 break;
4308 SPLIT_INSERT(self->str, i + sublen, j);
4309 j = i;
4310 i -= sublen;
4311 } else
4312 i--;
4313 }
4314 if (j >= 0) {
4315 SPLIT_INSERT(self->str, 0, j);
4316 }
4317 return list;
4318
4319 onError:
4320 Py_DECREF(list);
4321 return NULL;
4322}
4323
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004325#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326
4327static
4328PyObject *split(PyUnicodeObject *self,
4329 PyUnicodeObject *substring,
4330 int maxcount)
4331{
4332 PyObject *list;
4333
4334 if (maxcount < 0)
4335 maxcount = INT_MAX;
4336
4337 list = PyList_New(0);
4338 if (!list)
4339 return NULL;
4340
4341 if (substring == NULL)
4342 return split_whitespace(self,list,maxcount);
4343
4344 else if (substring->length == 1)
4345 return split_char(self,list,substring->str[0],maxcount);
4346
4347 else if (substring->length == 0) {
4348 Py_DECREF(list);
4349 PyErr_SetString(PyExc_ValueError, "empty separator");
4350 return NULL;
4351 }
4352 else
4353 return split_substring(self,list,substring,maxcount);
4354}
4355
Tim Petersced69f82003-09-16 20:30:58 +00004356static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004357PyObject *rsplit(PyUnicodeObject *self,
4358 PyUnicodeObject *substring,
4359 int maxcount)
4360{
4361 PyObject *list;
4362
4363 if (maxcount < 0)
4364 maxcount = INT_MAX;
4365
4366 list = PyList_New(0);
4367 if (!list)
4368 return NULL;
4369
4370 if (substring == NULL)
4371 return rsplit_whitespace(self,list,maxcount);
4372
4373 else if (substring->length == 1)
4374 return rsplit_char(self,list,substring->str[0],maxcount);
4375
4376 else if (substring->length == 0) {
4377 Py_DECREF(list);
4378 PyErr_SetString(PyExc_ValueError, "empty separator");
4379 return NULL;
4380 }
4381 else
4382 return rsplit_substring(self,list,substring,maxcount);
4383}
4384
4385static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386PyObject *replace(PyUnicodeObject *self,
4387 PyUnicodeObject *str1,
4388 PyUnicodeObject *str2,
4389 int maxcount)
4390{
4391 PyUnicodeObject *u;
4392
4393 if (maxcount < 0)
4394 maxcount = INT_MAX;
4395
4396 if (str1->length == 1 && str2->length == 1) {
4397 int i;
4398
4399 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004400 if (!findchar(self->str, self->length, str1->str[0]) &&
4401 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402 /* nothing to replace, return original string */
4403 Py_INCREF(self);
4404 u = self;
4405 } else {
4406 Py_UNICODE u1 = str1->str[0];
4407 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004408
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004410 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 self->length
4412 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004413 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004414 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004415 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 for (i = 0; i < u->length; i++)
4417 if (u->str[i] == u1) {
4418 if (--maxcount < 0)
4419 break;
4420 u->str[i] = u2;
4421 }
4422 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424
4425 } else {
4426 int n, i;
4427 Py_UNICODE *p;
4428
4429 /* replace strings */
4430 n = count(self, 0, self->length, str1);
4431 if (n > maxcount)
4432 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004433 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004435 if (PyUnicode_CheckExact(self)) {
4436 Py_INCREF(self);
4437 u = self;
4438 }
4439 else {
4440 u = (PyUnicodeObject *)
4441 PyUnicode_FromUnicode(self->str, self->length);
4442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 } else {
4444 u = _PyUnicode_New(
4445 self->length + n * (str2->length - str1->length));
4446 if (u) {
4447 i = 0;
4448 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004449 if (str1->length > 0) {
4450 while (i <= self->length - str1->length)
4451 if (Py_UNICODE_MATCH(self, i, str1)) {
4452 /* replace string segment */
4453 Py_UNICODE_COPY(p, str2->str, str2->length);
4454 p += str2->length;
4455 i += str1->length;
4456 if (--n <= 0) {
4457 /* copy remaining part */
4458 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4459 break;
4460 }
4461 } else
4462 *p++ = self->str[i++];
4463 } else {
4464 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 Py_UNICODE_COPY(p, str2->str, str2->length);
4466 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004467 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004470 }
4471 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473 }
4474 }
4475 }
Tim Petersced69f82003-09-16 20:30:58 +00004476
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 return (PyObject *) u;
4478}
4479
4480/* --- Unicode Object Methods --------------------------------------------- */
4481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004482PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483"S.title() -> unicode\n\
4484\n\
4485Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004486characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487
4488static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004489unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491 return fixup(self, fixtitle);
4492}
4493
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004494PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495"S.capitalize() -> unicode\n\
4496\n\
4497Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004498have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499
4500static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004501unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503 return fixup(self, fixcapitalize);
4504}
4505
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method S.capwords(): split on whitespace, capitalize every
   word in place inside the list, then join the words again. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    int k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list slot with the result */
    k = 0;
    while (k < PyList_GET_SIZE(words)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, item);
        k++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

onError:
    /* item is NULL here when a capitalization step failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
4543
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004544/* Argument converter. Coerces to a single unicode character */
4545
4546static int
4547convert_uc(PyObject *obj, void *addr)
4548{
4549 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4550 PyObject *uniobj;
4551 Py_UNICODE *unistr;
4552
4553 uniobj = PyUnicode_FromObject(obj);
4554 if (uniobj == NULL) {
4555 PyErr_SetString(PyExc_TypeError,
4556 "The fill character cannot be converted to Unicode");
4557 return 0;
4558 }
4559 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4560 PyErr_SetString(PyExc_TypeError,
4561 "The fill character must be exactly one character long");
4562 Py_DECREF(uniobj);
4563 return 0;
4564 }
4565 unistr = PyUnicode_AS_UNICODE(uniobj);
4566 *fillcharloc = unistr[0];
4567 Py_DECREF(uniobj);
4568 return 1;
4569}
4570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004571PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004572"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004574Return S centered in a Unicode string of length width. Padding is\n\
4575done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576
4577static PyObject *
4578unicode_center(PyUnicodeObject *self, PyObject *args)
4579{
4580 int marg, left;
4581 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004582 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004584 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585 return NULL;
4586
Tim Peters7a29bd52001-09-12 03:03:31 +00004587 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588 Py_INCREF(self);
4589 return (PyObject*) self;
4590 }
4591
4592 marg = width - self->length;
4593 left = marg / 2 + (marg & width & 1);
4594
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004595 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596}
4597
Marc-André Lemburge5034372000-08-08 08:04:29 +00004598#if 0
4599
4600/* This code should go into some future Unicode collation support
4601 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004602 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004603
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004604/* speedy UTF-16 code point order comparison */
4605/* gleaned from: */
4606/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4607
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004608static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004609{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004610 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004611 0, 0, 0, 0, 0, 0, 0, 0,
4612 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004613 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004614};
4615
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616static int
4617unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4618{
4619 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004620
Guido van Rossumd57fd912000-03-10 22:53:23 +00004621 Py_UNICODE *s1 = str1->str;
4622 Py_UNICODE *s2 = str2->str;
4623
4624 len1 = str1->length;
4625 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004626
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004628 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004629
4630 c1 = *s1++;
4631 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004632
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004633 if (c1 > (1<<11) * 26)
4634 c1 += utf16Fixup[c1>>11];
4635 if (c2 > (1<<11) * 26)
4636 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004637 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004638
4639 if (c1 != c2)
4640 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004641
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004642 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643 }
4644
4645 return (len1 < len2) ? -1 : (len1 != len2);
4646}
4647
Marc-André Lemburge5034372000-08-08 08:04:29 +00004648#else
4649
4650static int
4651unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4652{
4653 register int len1, len2;
4654
4655 Py_UNICODE *s1 = str1->str;
4656 Py_UNICODE *s2 = str2->str;
4657
4658 len1 = str1->length;
4659 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004660
Marc-André Lemburge5034372000-08-08 08:04:29 +00004661 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004662 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004663
Fredrik Lundh45714e92001-06-26 16:39:36 +00004664 c1 = *s1++;
4665 c2 = *s2++;
4666
4667 if (c1 != c2)
4668 return (c1 < c2) ? -1 : 1;
4669
Marc-André Lemburge5034372000-08-08 08:04:29 +00004670 len1--; len2--;
4671 }
4672
4673 return (len1 < len2) ? -1 : (len1 != len2);
4674}
4675
4676#endif
4677
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678int PyUnicode_Compare(PyObject *left,
4679 PyObject *right)
4680{
4681 PyUnicodeObject *u = NULL, *v = NULL;
4682 int result;
4683
4684 /* Coerce the two arguments */
4685 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4686 if (u == NULL)
4687 goto onError;
4688 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4689 if (v == NULL)
4690 goto onError;
4691
Thomas Wouters7e474022000-07-16 12:04:32 +00004692 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693 if (v == u) {
4694 Py_DECREF(u);
4695 Py_DECREF(v);
4696 return 0;
4697 }
4698
4699 result = unicode_compare(u, v);
4700
4701 Py_DECREF(u);
4702 Py_DECREF(v);
4703 return result;
4704
4705onError:
4706 Py_XDECREF(u);
4707 Py_XDECREF(v);
4708 return -1;
4709}
4710
Guido van Rossum403d68b2000-03-13 15:55:09 +00004711int PyUnicode_Contains(PyObject *container,
4712 PyObject *element)
4713{
4714 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004715 int result, size;
4716 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004717
4718 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004719 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004720 if (v == NULL) {
4721 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004722 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004723 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004724 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004725 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004726 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004727 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004728
Barry Warsaw817918c2002-08-06 16:58:21 +00004729 size = PyUnicode_GET_SIZE(v);
4730 rhs = PyUnicode_AS_UNICODE(v);
4731 lhs = PyUnicode_AS_UNICODE(u);
4732
Guido van Rossum403d68b2000-03-13 15:55:09 +00004733 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004734 if (size == 1) {
4735 end = lhs + PyUnicode_GET_SIZE(u);
4736 while (lhs < end) {
4737 if (*lhs++ == *rhs) {
4738 result = 1;
4739 break;
4740 }
4741 }
4742 }
4743 else {
4744 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4745 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004746 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004747 result = 1;
4748 break;
4749 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004750 }
4751 }
4752
4753 Py_DECREF(u);
4754 Py_DECREF(v);
4755 return result;
4756
4757onError:
4758 Py_XDECREF(u);
4759 Py_XDECREF(v);
4760 return -1;
4761}
4762
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763/* Concat to string or Unicode object giving a new Unicode object. */
4764
4765PyObject *PyUnicode_Concat(PyObject *left,
4766 PyObject *right)
4767{
4768 PyUnicodeObject *u = NULL, *v = NULL, *w;
4769
4770 /* Coerce the two arguments */
4771 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4772 if (u == NULL)
4773 goto onError;
4774 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4775 if (v == NULL)
4776 goto onError;
4777
4778 /* Shortcuts */
4779 if (v == unicode_empty) {
4780 Py_DECREF(v);
4781 return (PyObject *)u;
4782 }
4783 if (u == unicode_empty) {
4784 Py_DECREF(u);
4785 return (PyObject *)v;
4786 }
4787
4788 /* Concat the two Unicode strings */
4789 w = _PyUnicode_New(u->length + v->length);
4790 if (w == NULL)
4791 goto onError;
4792 Py_UNICODE_COPY(w->str, u->str, u->length);
4793 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4794
4795 Py_DECREF(u);
4796 Py_DECREF(v);
4797 return (PyObject *)w;
4798
4799onError:
4800 Py_XDECREF(u);
4801 Py_XDECREF(v);
4802 return NULL;
4803}
4804
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004805PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806"S.count(sub[, start[, end]]) -> int\n\
4807\n\
4808Return the number of occurrences of substring sub in Unicode string\n\
4809S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004810interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811
4812static PyObject *
4813unicode_count(PyUnicodeObject *self, PyObject *args)
4814{
4815 PyUnicodeObject *substring;
4816 int start = 0;
4817 int end = INT_MAX;
4818 PyObject *result;
4819
Guido van Rossumb8872e62000-05-09 14:14:27 +00004820 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4821 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 return NULL;
4823
4824 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4825 (PyObject *)substring);
4826 if (substring == NULL)
4827 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004828
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 if (start < 0)
4830 start += self->length;
4831 if (start < 0)
4832 start = 0;
4833 if (end > self->length)
4834 end = self->length;
4835 if (end < 0)
4836 end += self->length;
4837 if (end < 0)
4838 end = 0;
4839
4840 result = PyInt_FromLong((long) count(self, start, end, substring));
4841
4842 Py_DECREF(substring);
4843 return result;
4844}
4845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004846PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847"S.encode([encoding[,errors]]) -> string\n\
4848\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004849Return an encoded string version of S. Default encoding is the current\n\
4850default string encoding. errors may be given to set a different error\n\
4851handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004852a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4853'xmlcharrefreplace' as well as any other name registered with\n\
4854codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855
4856static PyObject *
4857unicode_encode(PyUnicodeObject *self, PyObject *args)
4858{
4859 char *encoding = NULL;
4860 char *errors = NULL;
4861 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4862 return NULL;
4863 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4864}
4865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004866PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867"S.expandtabs([tabsize]) -> unicode\n\
4868\n\
4869Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004870If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871
4872static PyObject*
4873unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4874{
4875 Py_UNICODE *e;
4876 Py_UNICODE *p;
4877 Py_UNICODE *q;
4878 int i, j;
4879 PyUnicodeObject *u;
4880 int tabsize = 8;
4881
4882 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4883 return NULL;
4884
Thomas Wouters7e474022000-07-16 12:04:32 +00004885 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 i = j = 0;
4887 e = self->str + self->length;
4888 for (p = self->str; p < e; p++)
4889 if (*p == '\t') {
4890 if (tabsize > 0)
4891 j += tabsize - (j % tabsize);
4892 }
4893 else {
4894 j++;
4895 if (*p == '\n' || *p == '\r') {
4896 i += j;
4897 j = 0;
4898 }
4899 }
4900
4901 /* Second pass: create output string and fill it */
4902 u = _PyUnicode_New(i + j);
4903 if (!u)
4904 return NULL;
4905
4906 j = 0;
4907 q = u->str;
4908
4909 for (p = self->str; p < e; p++)
4910 if (*p == '\t') {
4911 if (tabsize > 0) {
4912 i = tabsize - (j % tabsize);
4913 j += i;
4914 while (i--)
4915 *q++ = ' ';
4916 }
4917 }
4918 else {
4919 j++;
4920 *q++ = *p;
4921 if (*p == '\n' || *p == '\r')
4922 j = 0;
4923 }
4924
4925 return (PyObject*) u;
4926}
4927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004928PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929"S.find(sub [,start [,end]]) -> int\n\
4930\n\
4931Return the lowest index in S where substring sub is found,\n\
4932such that sub is contained within s[start,end]. Optional\n\
4933arguments start and end are interpreted as in slice notation.\n\
4934\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004935Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936
4937static PyObject *
4938unicode_find(PyUnicodeObject *self, PyObject *args)
4939{
4940 PyUnicodeObject *substring;
4941 int start = 0;
4942 int end = INT_MAX;
4943 PyObject *result;
4944
Guido van Rossumb8872e62000-05-09 14:14:27 +00004945 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4946 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947 return NULL;
4948 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4949 (PyObject *)substring);
4950 if (substring == NULL)
4951 return NULL;
4952
4953 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4954
4955 Py_DECREF(substring);
4956 return result;
4957}
4958
4959static PyObject *
4960unicode_getitem(PyUnicodeObject *self, int index)
4961{
4962 if (index < 0 || index >= self->length) {
4963 PyErr_SetString(PyExc_IndexError, "string index out of range");
4964 return NULL;
4965 }
4966
4967 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4968}
4969
4970static long
4971unicode_hash(PyUnicodeObject *self)
4972{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004973 /* Since Unicode objects compare equal to their ASCII string
4974 counterparts, they should use the individual character values
4975 as basis for their hash value. This is needed to assure that
4976 strings and Unicode objects behave in the same way as
4977 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978
Fredrik Lundhdde61642000-07-10 18:27:47 +00004979 register int len;
4980 register Py_UNICODE *p;
4981 register long x;
4982
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983 if (self->hash != -1)
4984 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004985 len = PyUnicode_GET_SIZE(self);
4986 p = PyUnicode_AS_UNICODE(self);
4987 x = *p << 7;
4988 while (--len >= 0)
4989 x = (1000003*x) ^ *p++;
4990 x ^= PyUnicode_GET_SIZE(self);
4991 if (x == -1)
4992 x = -2;
4993 self->hash = x;
4994 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995}
4996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004997PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998"S.index(sub [,start [,end]]) -> int\n\
4999\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005000Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001
5002static PyObject *
5003unicode_index(PyUnicodeObject *self, PyObject *args)
5004{
5005 int result;
5006 PyUnicodeObject *substring;
5007 int start = 0;
5008 int end = INT_MAX;
5009
Guido van Rossumb8872e62000-05-09 14:14:27 +00005010 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5011 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005013
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5015 (PyObject *)substring);
5016 if (substring == NULL)
5017 return NULL;
5018
5019 result = findstring(self, substring, start, end, 1);
5020
5021 Py_DECREF(substring);
5022 if (result < 0) {
5023 PyErr_SetString(PyExc_ValueError, "substring not found");
5024 return NULL;
5025 }
5026 return PyInt_FromLong(result);
5027}
5028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005029PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005030"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005032Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005033at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034
5035static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005036unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037{
5038 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5039 register const Py_UNICODE *e;
5040 int cased;
5041
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 /* Shortcut for single character strings */
5043 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005044 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005046 /* Special case for empty strings */
5047 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005048 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005049
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 e = p + PyUnicode_GET_SIZE(self);
5051 cased = 0;
5052 for (; p < e; p++) {
5053 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005054
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005056 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057 else if (!cased && Py_UNICODE_ISLOWER(ch))
5058 cased = 1;
5059 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005060 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061}
5062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005063PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005064"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005066Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005067at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068
5069static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005070unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071{
5072 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5073 register const Py_UNICODE *e;
5074 int cased;
5075
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 /* Shortcut for single character strings */
5077 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005078 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005080 /* Special case for empty strings */
5081 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005082 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005083
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084 e = p + PyUnicode_GET_SIZE(self);
5085 cased = 0;
5086 for (; p < e; p++) {
5087 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005088
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005090 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 else if (!cased && Py_UNICODE_ISUPPER(ch))
5092 cased = 1;
5093 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005094 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095}
5096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005097PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005098"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005100Return True if S is a titlecased string and there is at least one\n\
5101character in S, i.e. upper- and titlecase characters may only\n\
5102follow uncased characters and lowercase characters only cased ones.\n\
5103Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104
5105static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005106unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107{
5108 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5109 register const Py_UNICODE *e;
5110 int cased, previous_is_cased;
5111
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 /* Shortcut for single character strings */
5113 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005114 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5115 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005117 /* Special case for empty strings */
5118 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005119 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005120
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 e = p + PyUnicode_GET_SIZE(self);
5122 cased = 0;
5123 previous_is_cased = 0;
5124 for (; p < e; p++) {
5125 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005126
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5128 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005129 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 previous_is_cased = 1;
5131 cased = 1;
5132 }
5133 else if (Py_UNICODE_ISLOWER(ch)) {
5134 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005135 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 previous_is_cased = 1;
5137 cased = 1;
5138 }
5139 else
5140 previous_is_cased = 0;
5141 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005142 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143}
5144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005145PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005146"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005148Return True if all characters in S are whitespace\n\
5149and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150
5151static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005152unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153{
5154 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5155 register const Py_UNICODE *e;
5156
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 /* Shortcut for single character strings */
5158 if (PyUnicode_GET_SIZE(self) == 1 &&
5159 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005160 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005162 /* Special case for empty strings */
5163 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005164 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005165
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 e = p + PyUnicode_GET_SIZE(self);
5167 for (; p < e; p++) {
5168 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005169 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005171 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172}
5173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005174PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005175"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005176\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005177Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005178and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005179
5180static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005181unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005182{
5183 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5184 register const Py_UNICODE *e;
5185
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005186 /* Shortcut for single character strings */
5187 if (PyUnicode_GET_SIZE(self) == 1 &&
5188 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005189 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005190
5191 /* Special case for empty strings */
5192 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005193 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005194
5195 e = p + PyUnicode_GET_SIZE(self);
5196 for (; p < e; p++) {
5197 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005198 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005199 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005200 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005201}
5202
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005203PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005204"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005205\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005206Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005207and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005208
5209static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005210unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005211{
5212 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5213 register const Py_UNICODE *e;
5214
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005215 /* Shortcut for single character strings */
5216 if (PyUnicode_GET_SIZE(self) == 1 &&
5217 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005218 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005219
5220 /* Special case for empty strings */
5221 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005222 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005223
5224 e = p + PyUnicode_GET_SIZE(self);
5225 for (; p < e; p++) {
5226 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005227 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005228 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005229 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005230}
5231
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005232PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005233"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005235Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005236False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237
5238static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005239unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240{
5241 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5242 register const Py_UNICODE *e;
5243
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 /* Shortcut for single character strings */
5245 if (PyUnicode_GET_SIZE(self) == 1 &&
5246 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005247 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005249 /* Special case for empty strings */
5250 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005251 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005252
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 e = p + PyUnicode_GET_SIZE(self);
5254 for (; p < e; p++) {
5255 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005256 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005258 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259}
5260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005261PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005262"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005264Return True if all characters in S are digits\n\
5265and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266
5267static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005268unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269{
5270 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5271 register const Py_UNICODE *e;
5272
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 /* Shortcut for single character strings */
5274 if (PyUnicode_GET_SIZE(self) == 1 &&
5275 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005276 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005278 /* Special case for empty strings */
5279 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005280 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005281
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 e = p + PyUnicode_GET_SIZE(self);
5283 for (; p < e; p++) {
5284 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005285 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005287 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288}
5289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005290PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005291"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005293Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005294False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295
5296static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005297unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298{
5299 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5300 register const Py_UNICODE *e;
5301
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302 /* Shortcut for single character strings */
5303 if (PyUnicode_GET_SIZE(self) == 1 &&
5304 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005305 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005307 /* Special case for empty strings */
5308 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005309 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005310
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 e = p + PyUnicode_GET_SIZE(self);
5312 for (; p < e; p++) {
5313 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005314 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005316 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317}
5318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005319PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320"S.join(sequence) -> unicode\n\
5321\n\
5322Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005323sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324
5325static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005326unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005328 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329}
5330
5331static int
5332unicode_length(PyUnicodeObject *self)
5333{
5334 return self->length;
5335}
5336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005337PyDoc_STRVAR(ljust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005338"S.ljust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339\n\
5340Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005341done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342
5343static PyObject *
5344unicode_ljust(PyUnicodeObject *self, PyObject *args)
5345{
5346 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005347 Py_UNICODE fillchar = ' ';
5348
5349 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 return NULL;
5351
Tim Peters7a29bd52001-09-12 03:03:31 +00005352 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 Py_INCREF(self);
5354 return (PyObject*) self;
5355 }
5356
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005357 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358}
5359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005360PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361"S.lower() -> unicode\n\
5362\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005363Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364
5365static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005366unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 return fixup(self, fixlower);
5369}
5370
/* Strip-type selectors; they double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip type: the format string past its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
5379
5380static const Py_UNICODE *
5381unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5382{
Tim Peters030a5ce2002-04-22 19:00:10 +00005383 size_t i;
5384 for (i = 0; i < n; ++i)
5385 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005386 return s+i;
5387 return NULL;
5388}
5389
5390/* externally visible for str.strip(unicode) */
5391PyObject *
5392_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5393{
5394 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5395 int len = PyUnicode_GET_SIZE(self);
5396 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5397 int seplen = PyUnicode_GET_SIZE(sepobj);
5398 int i, j;
5399
5400 i = 0;
5401 if (striptype != RIGHTSTRIP) {
5402 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5403 i++;
5404 }
5405 }
5406
5407 j = len;
5408 if (striptype != LEFTSTRIP) {
5409 do {
5410 j--;
5411 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5412 j++;
5413 }
5414
5415 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5416 Py_INCREF(self);
5417 return (PyObject*)self;
5418 }
5419 else
5420 return PyUnicode_FromUnicode(s+i, j-i);
5421}
5422
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423
5424static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005425do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005427 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5428 int len = PyUnicode_GET_SIZE(self), i, j;
5429
5430 i = 0;
5431 if (striptype != RIGHTSTRIP) {
5432 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5433 i++;
5434 }
5435 }
5436
5437 j = len;
5438 if (striptype != LEFTSTRIP) {
5439 do {
5440 j--;
5441 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5442 j++;
5443 }
5444
5445 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5446 Py_INCREF(self);
5447 return (PyObject*)self;
5448 }
5449 else
5450 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451}
5452
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005453
5454static PyObject *
5455do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5456{
5457 PyObject *sep = NULL;
5458
5459 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5460 return NULL;
5461
5462 if (sep != NULL && sep != Py_None) {
5463 if (PyUnicode_Check(sep))
5464 return _PyUnicode_XStrip(self, striptype, sep);
5465 else if (PyString_Check(sep)) {
5466 PyObject *res;
5467 sep = PyUnicode_FromObject(sep);
5468 if (sep==NULL)
5469 return NULL;
5470 res = _PyUnicode_XStrip(self, striptype, sep);
5471 Py_DECREF(sep);
5472 return res;
5473 }
5474 else {
5475 PyErr_Format(PyExc_TypeError,
5476 "%s arg must be None, unicode or str",
5477 STRIPNAME(striptype));
5478 return NULL;
5479 }
5480 }
5481
5482 return do_strip(self, striptype);
5483}
5484
5485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005486PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005487"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005488\n\
5489Return a copy of the string S with leading and trailing\n\
5490whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005491If chars is given and not None, remove characters in chars instead.\n\
5492If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005493
5494static PyObject *
5495unicode_strip(PyUnicodeObject *self, PyObject *args)
5496{
5497 if (PyTuple_GET_SIZE(args) == 0)
5498 return do_strip(self, BOTHSTRIP); /* Common case */
5499 else
5500 return do_argstrip(self, BOTHSTRIP, args);
5501}
5502
5503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005504PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005505"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005506\n\
5507Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005508If chars is given and not None, remove characters in chars instead.\n\
5509If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005510
5511static PyObject *
5512unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5513{
5514 if (PyTuple_GET_SIZE(args) == 0)
5515 return do_strip(self, LEFTSTRIP); /* Common case */
5516 else
5517 return do_argstrip(self, LEFTSTRIP, args);
5518}
5519
5520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005521PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005522"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005523\n\
5524Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005525If chars is given and not None, remove characters in chars instead.\n\
5526If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005527
5528static PyObject *
5529unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5530{
5531 if (PyTuple_GET_SIZE(args) == 0)
5532 return do_strip(self, RIGHTSTRIP); /* Common case */
5533 else
5534 return do_argstrip(self, RIGHTSTRIP, args);
5535}
5536
5537
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538static PyObject*
5539unicode_repeat(PyUnicodeObject *str, int len)
5540{
5541 PyUnicodeObject *u;
5542 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005543 int nchars;
5544 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545
5546 if (len < 0)
5547 len = 0;
5548
Tim Peters7a29bd52001-09-12 03:03:31 +00005549 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 /* no repeat, return original string */
5551 Py_INCREF(str);
5552 return (PyObject*) str;
5553 }
Tim Peters8f422462000-09-09 06:13:41 +00005554
5555 /* ensure # of chars needed doesn't overflow int and # of bytes
5556 * needed doesn't overflow size_t
5557 */
5558 nchars = len * str->length;
5559 if (len && nchars / len != str->length) {
5560 PyErr_SetString(PyExc_OverflowError,
5561 "repeated string is too long");
5562 return NULL;
5563 }
5564 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5565 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5566 PyErr_SetString(PyExc_OverflowError,
5567 "repeated string is too long");
5568 return NULL;
5569 }
5570 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 if (!u)
5572 return NULL;
5573
5574 p = u->str;
5575
5576 while (len-- > 0) {
5577 Py_UNICODE_COPY(p, str->str, str->length);
5578 p += str->length;
5579 }
5580
5581 return (PyObject*) u;
5582}
5583
5584PyObject *PyUnicode_Replace(PyObject *obj,
5585 PyObject *subobj,
5586 PyObject *replobj,
5587 int maxcount)
5588{
5589 PyObject *self;
5590 PyObject *str1;
5591 PyObject *str2;
5592 PyObject *result;
5593
5594 self = PyUnicode_FromObject(obj);
5595 if (self == NULL)
5596 return NULL;
5597 str1 = PyUnicode_FromObject(subobj);
5598 if (str1 == NULL) {
5599 Py_DECREF(self);
5600 return NULL;
5601 }
5602 str2 = PyUnicode_FromObject(replobj);
5603 if (str2 == NULL) {
5604 Py_DECREF(self);
5605 Py_DECREF(str1);
5606 return NULL;
5607 }
Tim Petersced69f82003-09-16 20:30:58 +00005608 result = replace((PyUnicodeObject *)self,
5609 (PyUnicodeObject *)str1,
5610 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 maxcount);
5612 Py_DECREF(self);
5613 Py_DECREF(str1);
5614 Py_DECREF(str2);
5615 return result;
5616}
5617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005618PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619"S.replace (old, new[, maxsplit]) -> unicode\n\
5620\n\
5621Return a copy of S with all occurrences of substring\n\
5622old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005623given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624
5625static PyObject*
5626unicode_replace(PyUnicodeObject *self, PyObject *args)
5627{
5628 PyUnicodeObject *str1;
5629 PyUnicodeObject *str2;
5630 int maxcount = -1;
5631 PyObject *result;
5632
5633 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5634 return NULL;
5635 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5636 if (str1 == NULL)
5637 return NULL;
5638 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005639 if (str2 == NULL) {
5640 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643
5644 result = replace(self, str1, str2, maxcount);
5645
5646 Py_DECREF(str1);
5647 Py_DECREF(str2);
5648 return result;
5649}
5650
5651static
5652PyObject *unicode_repr(PyObject *unicode)
5653{
5654 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5655 PyUnicode_GET_SIZE(unicode),
5656 1);
5657}
5658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005659PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660"S.rfind(sub [,start [,end]]) -> int\n\
5661\n\
5662Return the highest index in S where substring sub is found,\n\
5663such that sub is contained within s[start,end]. Optional\n\
5664arguments start and end are interpreted as in slice notation.\n\
5665\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005666Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667
5668static PyObject *
5669unicode_rfind(PyUnicodeObject *self, PyObject *args)
5670{
5671 PyUnicodeObject *substring;
5672 int start = 0;
5673 int end = INT_MAX;
5674 PyObject *result;
5675
Guido van Rossumb8872e62000-05-09 14:14:27 +00005676 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5677 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 return NULL;
5679 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5680 (PyObject *)substring);
5681 if (substring == NULL)
5682 return NULL;
5683
5684 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5685
5686 Py_DECREF(substring);
5687 return result;
5688}
5689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005690PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691"S.rindex(sub [,start [,end]]) -> int\n\
5692\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005693Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694
5695static PyObject *
5696unicode_rindex(PyUnicodeObject *self, PyObject *args)
5697{
5698 int result;
5699 PyUnicodeObject *substring;
5700 int start = 0;
5701 int end = INT_MAX;
5702
Guido van Rossumb8872e62000-05-09 14:14:27 +00005703 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5704 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 return NULL;
5706 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5707 (PyObject *)substring);
5708 if (substring == NULL)
5709 return NULL;
5710
5711 result = findstring(self, substring, start, end, -1);
5712
5713 Py_DECREF(substring);
5714 if (result < 0) {
5715 PyErr_SetString(PyExc_ValueError, "substring not found");
5716 return NULL;
5717 }
5718 return PyInt_FromLong(result);
5719}
5720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005721PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005722"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723\n\
5724Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005725done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726
5727static PyObject *
5728unicode_rjust(PyUnicodeObject *self, PyObject *args)
5729{
5730 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005731 Py_UNICODE fillchar = ' ';
5732
5733 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 return NULL;
5735
Tim Peters7a29bd52001-09-12 03:03:31 +00005736 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 Py_INCREF(self);
5738 return (PyObject*) self;
5739 }
5740
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005741 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742}
5743
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744static PyObject*
5745unicode_slice(PyUnicodeObject *self, int start, int end)
5746{
5747 /* standard clamping */
5748 if (start < 0)
5749 start = 0;
5750 if (end < 0)
5751 end = 0;
5752 if (end > self->length)
5753 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005754 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 /* full slice, return original string */
5756 Py_INCREF(self);
5757 return (PyObject*) self;
5758 }
5759 if (start > end)
5760 start = end;
5761 /* copy slice */
5762 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5763 end - start);
5764}
5765
5766PyObject *PyUnicode_Split(PyObject *s,
5767 PyObject *sep,
5768 int maxsplit)
5769{
5770 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005771
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 s = PyUnicode_FromObject(s);
5773 if (s == NULL)
5774 return NULL;
5775 if (sep != NULL) {
5776 sep = PyUnicode_FromObject(sep);
5777 if (sep == NULL) {
5778 Py_DECREF(s);
5779 return NULL;
5780 }
5781 }
5782
5783 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5784
5785 Py_DECREF(s);
5786 Py_XDECREF(sep);
5787 return result;
5788}
5789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005790PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791"S.split([sep [,maxsplit]]) -> list of strings\n\
5792\n\
5793Return a list of the words in S, using sep as the\n\
5794delimiter string. If maxsplit is given, at most maxsplit\n\
5795splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005796is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797
5798static PyObject*
5799unicode_split(PyUnicodeObject *self, PyObject *args)
5800{
5801 PyObject *substring = Py_None;
5802 int maxcount = -1;
5803
5804 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5805 return NULL;
5806
5807 if (substring == Py_None)
5808 return split(self, NULL, maxcount);
5809 else if (PyUnicode_Check(substring))
5810 return split(self, (PyUnicodeObject *)substring, maxcount);
5811 else
5812 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5813}
5814
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005815PyObject *PyUnicode_RSplit(PyObject *s,
5816 PyObject *sep,
5817 int maxsplit)
5818{
5819 PyObject *result;
5820
5821 s = PyUnicode_FromObject(s);
5822 if (s == NULL)
5823 return NULL;
5824 if (sep != NULL) {
5825 sep = PyUnicode_FromObject(sep);
5826 if (sep == NULL) {
5827 Py_DECREF(s);
5828 return NULL;
5829 }
5830 }
5831
5832 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5833
5834 Py_DECREF(s);
5835 Py_XDECREF(sep);
5836 return result;
5837}
5838
5839PyDoc_STRVAR(rsplit__doc__,
5840"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
5841\n\
5842Return a list of the words in S, using sep as the\n\
5843delimiter string, starting at the end of the string and\n\
5844working to the front. If maxsplit is given, at most maxsplit\n\
5845splits are done. If sep is not specified, any whitespace string\n\
5846is a separator.");
5847
5848static PyObject*
5849unicode_rsplit(PyUnicodeObject *self, PyObject *args)
5850{
5851 PyObject *substring = Py_None;
5852 int maxcount = -1;
5853
5854 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
5855 return NULL;
5856
5857 if (substring == Py_None)
5858 return rsplit(self, NULL, maxcount);
5859 else if (PyUnicode_Check(substring))
5860 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
5861 else
5862 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
5863}
5864
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005865PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005866"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867\n\
5868Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005869Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005870is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871
5872static PyObject*
5873unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5874{
Guido van Rossum86662912000-04-11 15:38:46 +00005875 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876
Guido van Rossum86662912000-04-11 15:38:46 +00005877 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 return NULL;
5879
Guido van Rossum86662912000-04-11 15:38:46 +00005880 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881}
5882
5883static
5884PyObject *unicode_str(PyUnicodeObject *self)
5885{
Fred Drakee4315f52000-05-09 19:53:39 +00005886 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887}
5888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005889PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890"S.swapcase() -> unicode\n\
5891\n\
5892Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005893and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894
5895static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005896unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 return fixup(self, fixswapcase);
5899}
5900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005901PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902"S.translate(table) -> unicode\n\
5903\n\
5904Return a copy of the string S, where all characters have been mapped\n\
5905through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005906Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5907Unmapped characters are left untouched. Characters mapped to None\n\
5908are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909
5910static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005911unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912{
Tim Petersced69f82003-09-16 20:30:58 +00005913 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00005915 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 "ignore");
5917}
5918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005919PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920"S.upper() -> unicode\n\
5921\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005922Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923
5924static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005925unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 return fixup(self, fixupper);
5928}
5929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005930PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931"S.zfill(width) -> unicode\n\
5932\n\
5933Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005934of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935
5936static PyObject *
5937unicode_zfill(PyUnicodeObject *self, PyObject *args)
5938{
5939 int fill;
5940 PyUnicodeObject *u;
5941
5942 int width;
5943 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5944 return NULL;
5945
5946 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005947 if (PyUnicode_CheckExact(self)) {
5948 Py_INCREF(self);
5949 return (PyObject*) self;
5950 }
5951 else
5952 return PyUnicode_FromUnicode(
5953 PyUnicode_AS_UNICODE(self),
5954 PyUnicode_GET_SIZE(self)
5955 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 }
5957
5958 fill = width - self->length;
5959
5960 u = pad(self, fill, 0, '0');
5961
Walter Dörwald068325e2002-04-15 13:36:47 +00005962 if (u == NULL)
5963 return NULL;
5964
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 if (u->str[fill] == '+' || u->str[fill] == '-') {
5966 /* move sign to beginning of string */
5967 u->str[0] = u->str[fill];
5968 u->str[fill] = '0';
5969 }
5970
5971 return (PyObject*) u;
5972}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973
#if 0
/* Compiled out: reports the current value of unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
5981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005982PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005983"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005985Return True if S starts with the specified prefix, False otherwise.\n\
5986With optional start, test S beginning at that position.\n\
5987With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988
5989static PyObject *
5990unicode_startswith(PyUnicodeObject *self,
5991 PyObject *args)
5992{
5993 PyUnicodeObject *substring;
5994 int start = 0;
5995 int end = INT_MAX;
5996 PyObject *result;
5997
Guido van Rossumb8872e62000-05-09 14:14:27 +00005998 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5999 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 return NULL;
6001 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6002 (PyObject *)substring);
6003 if (substring == NULL)
6004 return NULL;
6005
Guido van Rossum77f6a652002-04-03 22:41:51 +00006006 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007
6008 Py_DECREF(substring);
6009 return result;
6010}
6011
6012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006013PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006014"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006016Return True if S ends with the specified suffix, False otherwise.\n\
6017With optional start, test S beginning at that position.\n\
6018With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019
6020static PyObject *
6021unicode_endswith(PyUnicodeObject *self,
6022 PyObject *args)
6023{
6024 PyUnicodeObject *substring;
6025 int start = 0;
6026 int end = INT_MAX;
6027 PyObject *result;
6028
Guido van Rossumb8872e62000-05-09 14:14:27 +00006029 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6030 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 return NULL;
6032 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6033 (PyObject *)substring);
6034 if (substring == NULL)
6035 return NULL;
6036
Guido van Rossum77f6a652002-04-03 22:41:51 +00006037 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038
6039 Py_DECREF(substring);
6040 return result;
6041}
6042
6043
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006044
6045static PyObject *
6046unicode_getnewargs(PyUnicodeObject *v)
6047{
6048 return Py_BuildValue("(u#)", v->str, v->length);
6049}
6050
6051
/* Method table for unicode objects.  Each entry gives the Python-level
   name, the C implementation, the calling-convention flag and (where
   present) the docstring.  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* Compiled out: capwords is not exposed. */
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* Registered without a docstring (the fourth field is omitted). */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
6106
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006107static PyObject *
6108unicode_mod(PyObject *v, PyObject *w)
6109{
6110 if (!PyUnicode_Check(v)) {
6111 Py_INCREF(Py_NotImplemented);
6112 return Py_NotImplemented;
6113 }
6114 return PyUnicode_Format(v, w);
6115}
6116
/* Number protocol table for unicode objects.  Only nb_remainder is filled
   in (with unicode_mod); the four slots before it are explicitly 0 and
   the slots after it are left to C's implicit zero initialization. */
static PyNumberMethods unicode_as_number = {
    0,                          /*nb_add*/
    0,                          /*nb_subtract*/
    0,                          /*nb_multiply*/
    0,                          /*nb_divide*/
    unicode_mod,                /*nb_remainder*/
};
6124
/* Sequence protocol table for unicode objects.  The two assignment slots
   (sq_ass_item and sq_ass_slice) are 0; every other listed slot points
   at a unicode implementation. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length,           /* sq_length */
    (binaryfunc) PyUnicode_Concat,      /* sq_concat */
    (intargfunc) unicode_repeat,        /* sq_repeat */
    (intargfunc) unicode_getitem,       /* sq_item */
    (intintargfunc) unicode_slice,      /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains,     /*sq_contains*/
};
6135
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006136static PyObject*
6137unicode_subscript(PyUnicodeObject* self, PyObject* item)
6138{
6139 if (PyInt_Check(item)) {
6140 long i = PyInt_AS_LONG(item);
6141 if (i < 0)
6142 i += PyString_GET_SIZE(self);
6143 return unicode_getitem(self, i);
6144 } else if (PyLong_Check(item)) {
6145 long i = PyLong_AsLong(item);
6146 if (i == -1 && PyErr_Occurred())
6147 return NULL;
6148 if (i < 0)
6149 i += PyString_GET_SIZE(self);
6150 return unicode_getitem(self, i);
6151 } else if (PySlice_Check(item)) {
6152 int start, stop, step, slicelength, cur, i;
6153 Py_UNICODE* source_buf;
6154 Py_UNICODE* result_buf;
6155 PyObject* result;
6156
6157 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6158 &start, &stop, &step, &slicelength) < 0) {
6159 return NULL;
6160 }
6161
6162 if (slicelength <= 0) {
6163 return PyUnicode_FromUnicode(NULL, 0);
6164 } else {
6165 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6166 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6167
6168 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6169 result_buf[i] = source_buf[cur];
6170 }
Tim Petersced69f82003-09-16 20:30:58 +00006171
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006172 result = PyUnicode_FromUnicode(result_buf, slicelength);
6173 PyMem_FREE(result_buf);
6174 return result;
6175 }
6176 } else {
6177 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6178 return NULL;
6179 }
6180}
6181
/* Mapping protocol table for unicode objects: length and subscripting are
   provided, item assignment is not (slot is 0). */
static PyMappingMethods unicode_as_mapping = {
    (inquiry)unicode_length,            /* mp_length */
    (binaryfunc)unicode_subscript,      /* mp_subscript */
    (objobjargproc)0,                   /* mp_ass_subscript */
};
6187
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188static int
6189unicode_buffer_getreadbuf(PyUnicodeObject *self,
6190 int index,
6191 const void **ptr)
6192{
6193 if (index != 0) {
6194 PyErr_SetString(PyExc_SystemError,
6195 "accessing non-existent unicode segment");
6196 return -1;
6197 }
6198 *ptr = (void *) self->str;
6199 return PyUnicode_GET_DATA_SIZE(self);
6200}
6201
6202static int
6203unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6204 const void **ptr)
6205{
6206 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006207 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 return -1;
6209}
6210
6211static int
6212unicode_buffer_getsegcount(PyUnicodeObject *self,
6213 int *lenp)
6214{
6215 if (lenp)
6216 *lenp = PyUnicode_GET_DATA_SIZE(self);
6217 return 1;
6218}
6219
6220static int
6221unicode_buffer_getcharbuf(PyUnicodeObject *self,
6222 int index,
6223 const void **ptr)
6224{
6225 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006226
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 if (index != 0) {
6228 PyErr_SetString(PyExc_SystemError,
6229 "accessing non-existent unicode segment");
6230 return -1;
6231 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006232 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 if (str == NULL)
6234 return -1;
6235 *ptr = (void *) PyString_AS_STRING(str);
6236 return PyString_GET_SIZE(str);
6237}
6238
6239/* Helpers for PyUnicode_Format() */
6240
6241static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006242getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243{
6244 int argidx = *p_argidx;
6245 if (argidx < arglen) {
6246 (*p_argidx)++;
6247 if (arglen < 0)
6248 return args;
6249 else
6250 return PyTuple_GetItem(args, argidx);
6251 }
6252 PyErr_SetString(PyExc_TypeError,
6253 "not enough arguments for format string");
6254 return NULL;
6255}
6256
/* Bit flags collected while parsing a format specifier; each one is set
   by the flag character named in its comment. */
#define F_LJUST (1<<0)  /* '-' (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT   (1<<3)  /* '#' */
#define F_ZERO  (1<<4)  /* '0' */
6262
/* sprintf() variant that writes into a Py_UNICODE buffer.

   The output is first produced by vsprintf() as chars in the same memory
   that buffer points to, and is then widened in place to one Py_UNICODE
   per char.  No bound is applied here, so the caller has to provide a
   buffer large enough for the formatted text.

   Returns the value returned by vsprintf(). */
static
int usprintf(register Py_UNICODE *buffer, char *format, ...)
{
    register int i;
    int len;
    va_list va;
    char *charbuffer;
    va_start(va, format);

    /* First, format the string as char array, then expand to Py_UNICODE
       array. */
    charbuffer = (char *)buffer;
    len = vsprintf(charbuffer, format, va);
    /* Widen from the last character down to the first: the chars and the
       Py_UNICODE slots share storage, so going backwards keeps each char
       intact until it has been read. */
    for (i = len - 1; i >= 0; i--)
        buffer[i] = (Py_UNICODE) charbuffer[i];

    va_end(va);
    return len;
}
6282
Guido van Rossum078151d2002-08-11 04:24:12 +00006283/* XXX To save some code duplication, formatfloat/long/int could have been
6284 shared with stringobject.c, converting from 8-bit to Unicode after the
6285 formatting is done. */
6286
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287static int
6288formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006289 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 int flags,
6291 int prec,
6292 int type,
6293 PyObject *v)
6294{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006295 /* fmt = '%#.' + `prec` + `type`
6296 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 char fmt[20];
6298 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006299
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 x = PyFloat_AsDouble(v);
6301 if (x == -1.0 && PyErr_Occurred())
6302 return -1;
6303 if (prec < 0)
6304 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6306 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006307 /* Worst case length calc to ensure no buffer overrun:
6308
6309 'g' formats:
6310 fmt = %#.<prec>g
6311 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6312 for any double rep.)
6313 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6314
6315 'f' formats:
6316 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6317 len = 1 + 50 + 1 + prec = 52 + prec
6318
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006319 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006320 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006321
6322 */
6323 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6324 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006325 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006326 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006327 return -1;
6328 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006329 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6330 (flags&F_ALT) ? "#" : "",
6331 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332 return usprintf(buf, fmt, x);
6333}
6334
Tim Peters38fd5b62000-09-21 05:43:11 +00006335static PyObject*
6336formatlong(PyObject *val, int flags, int prec, int type)
6337{
6338 char *buf;
6339 int i, len;
6340 PyObject *str; /* temporary string object. */
6341 PyUnicodeObject *result;
6342
6343 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6344 if (!str)
6345 return NULL;
6346 result = _PyUnicode_New(len);
6347 for (i = 0; i < len; i++)
6348 result->str[i] = buf[i];
6349 result->str[len] = 0;
6350 Py_DECREF(str);
6351 return (PyObject*)result;
6352}
6353
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354static int
6355formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006356 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 int flags,
6358 int prec,
6359 int type,
6360 PyObject *v)
6361{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006362 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006363 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6364 * + 1 + 1
6365 * = 24
6366 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006367 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006368 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 long x;
6370
6371 x = PyInt_AsLong(v);
6372 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006373 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006374 if (x < 0 && type == 'u') {
6375 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006376 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006377 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6378 sign = "-";
6379 else
6380 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006382 prec = 1;
6383
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006384 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6385 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006386 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006387 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006388 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006389 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006390 return -1;
6391 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006392
6393 if ((flags & F_ALT) &&
6394 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006395 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006396 * of issues that cause pain:
6397 * - when 0 is being converted, the C standard leaves off
6398 * the '0x' or '0X', which is inconsistent with other
6399 * %#x/%#X conversions and inconsistent with Python's
6400 * hex() function
6401 * - there are platforms that violate the standard and
6402 * convert 0 with the '0x' or '0X'
6403 * (Metrowerks, Compaq Tru64)
6404 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006405 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006406 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006407 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006408 * We can achieve the desired consistency by inserting our
6409 * own '0x' or '0X' prefix, and substituting %x/%X in place
6410 * of %#x/%#X.
6411 *
6412 * Note that this is the same approach as used in
6413 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006414 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006415 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6416 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006417 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006418 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006419 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6420 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006421 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006422 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006423 if (sign[0])
6424 return usprintf(buf, fmt, -x);
6425 else
6426 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427}
6428
6429static int
6430formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006431 size_t buflen,
6432 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006434 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006435 if (PyUnicode_Check(v)) {
6436 if (PyUnicode_GET_SIZE(v) != 1)
6437 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006441 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006442 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006443 goto onError;
6444 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446
6447 else {
6448 /* Integer input truncated to a character */
6449 long x;
6450 x = PyInt_AsLong(v);
6451 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006452 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006453#ifdef Py_UNICODE_WIDE
6454 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006455 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006456 "%c arg not in range(0x110000) "
6457 "(wide Python build)");
6458 return -1;
6459 }
6460#else
6461 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006462 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006463 "%c arg not in range(0x10000) "
6464 "(narrow Python build)");
6465 return -1;
6466 }
6467#endif
6468 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 }
6470 buf[1] = '\0';
6471 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006472
6473 onError:
6474 PyErr_SetString(PyExc_TypeError,
6475 "%c requires int or char");
6476 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477}
6478
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006479/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6480
6481 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6482 chars are formatted. XXX This is a magic number. Each formatting
6483 routine does bounds checking to ensure no overflow, but a better
6484 solution may be to malloc a buffer of appropriate size for each
6485 format. For now, the current solution is sufficient.
6486*/
6487#define FORMATBUFLEN (size_t)120
6488
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489PyObject *PyUnicode_Format(PyObject *format,
6490 PyObject *args)
6491{
6492 Py_UNICODE *fmt, *res;
6493 int fmtcnt, rescnt, reslen, arglen, argidx;
6494 int args_owned = 0;
6495 PyUnicodeObject *result = NULL;
6496 PyObject *dict = NULL;
6497 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006498
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 if (format == NULL || args == NULL) {
6500 PyErr_BadInternalCall();
6501 return NULL;
6502 }
6503 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006504 if (uformat == NULL)
6505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 fmt = PyUnicode_AS_UNICODE(uformat);
6507 fmtcnt = PyUnicode_GET_SIZE(uformat);
6508
6509 reslen = rescnt = fmtcnt + 100;
6510 result = _PyUnicode_New(reslen);
6511 if (result == NULL)
6512 goto onError;
6513 res = PyUnicode_AS_UNICODE(result);
6514
6515 if (PyTuple_Check(args)) {
6516 arglen = PyTuple_Size(args);
6517 argidx = 0;
6518 }
6519 else {
6520 arglen = -1;
6521 argidx = -2;
6522 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006523 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6524 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 dict = args;
6526
6527 while (--fmtcnt >= 0) {
6528 if (*fmt != '%') {
6529 if (--rescnt < 0) {
6530 rescnt = fmtcnt + 100;
6531 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006532 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 return NULL;
6534 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6535 --rescnt;
6536 }
6537 *res++ = *fmt++;
6538 }
6539 else {
6540 /* Got a format specifier */
6541 int flags = 0;
6542 int width = -1;
6543 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 Py_UNICODE c = '\0';
6545 Py_UNICODE fill;
6546 PyObject *v = NULL;
6547 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006548 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 Py_UNICODE sign;
6550 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006551 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552
6553 fmt++;
6554 if (*fmt == '(') {
6555 Py_UNICODE *keystart;
6556 int keylen;
6557 PyObject *key;
6558 int pcount = 1;
6559
6560 if (dict == NULL) {
6561 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006562 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563 goto onError;
6564 }
6565 ++fmt;
6566 --fmtcnt;
6567 keystart = fmt;
6568 /* Skip over balanced parentheses */
6569 while (pcount > 0 && --fmtcnt >= 0) {
6570 if (*fmt == ')')
6571 --pcount;
6572 else if (*fmt == '(')
6573 ++pcount;
6574 fmt++;
6575 }
6576 keylen = fmt - keystart - 1;
6577 if (fmtcnt < 0 || pcount > 0) {
6578 PyErr_SetString(PyExc_ValueError,
6579 "incomplete format key");
6580 goto onError;
6581 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006582#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006583 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 then looked up since Python uses strings to hold
6585 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006586 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 key = PyUnicode_EncodeUTF8(keystart,
6588 keylen,
6589 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006590#else
6591 key = PyUnicode_FromUnicode(keystart, keylen);
6592#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 if (key == NULL)
6594 goto onError;
6595 if (args_owned) {
6596 Py_DECREF(args);
6597 args_owned = 0;
6598 }
6599 args = PyObject_GetItem(dict, key);
6600 Py_DECREF(key);
6601 if (args == NULL) {
6602 goto onError;
6603 }
6604 args_owned = 1;
6605 arglen = -1;
6606 argidx = -2;
6607 }
6608 while (--fmtcnt >= 0) {
6609 switch (c = *fmt++) {
6610 case '-': flags |= F_LJUST; continue;
6611 case '+': flags |= F_SIGN; continue;
6612 case ' ': flags |= F_BLANK; continue;
6613 case '#': flags |= F_ALT; continue;
6614 case '0': flags |= F_ZERO; continue;
6615 }
6616 break;
6617 }
6618 if (c == '*') {
6619 v = getnextarg(args, arglen, &argidx);
6620 if (v == NULL)
6621 goto onError;
6622 if (!PyInt_Check(v)) {
6623 PyErr_SetString(PyExc_TypeError,
6624 "* wants int");
6625 goto onError;
6626 }
6627 width = PyInt_AsLong(v);
6628 if (width < 0) {
6629 flags |= F_LJUST;
6630 width = -width;
6631 }
6632 if (--fmtcnt >= 0)
6633 c = *fmt++;
6634 }
6635 else if (c >= '0' && c <= '9') {
6636 width = c - '0';
6637 while (--fmtcnt >= 0) {
6638 c = *fmt++;
6639 if (c < '0' || c > '9')
6640 break;
6641 if ((width*10) / 10 != width) {
6642 PyErr_SetString(PyExc_ValueError,
6643 "width too big");
6644 goto onError;
6645 }
6646 width = width*10 + (c - '0');
6647 }
6648 }
6649 if (c == '.') {
6650 prec = 0;
6651 if (--fmtcnt >= 0)
6652 c = *fmt++;
6653 if (c == '*') {
6654 v = getnextarg(args, arglen, &argidx);
6655 if (v == NULL)
6656 goto onError;
6657 if (!PyInt_Check(v)) {
6658 PyErr_SetString(PyExc_TypeError,
6659 "* wants int");
6660 goto onError;
6661 }
6662 prec = PyInt_AsLong(v);
6663 if (prec < 0)
6664 prec = 0;
6665 if (--fmtcnt >= 0)
6666 c = *fmt++;
6667 }
6668 else if (c >= '0' && c <= '9') {
6669 prec = c - '0';
6670 while (--fmtcnt >= 0) {
6671 c = Py_CHARMASK(*fmt++);
6672 if (c < '0' || c > '9')
6673 break;
6674 if ((prec*10) / 10 != prec) {
6675 PyErr_SetString(PyExc_ValueError,
6676 "prec too big");
6677 goto onError;
6678 }
6679 prec = prec*10 + (c - '0');
6680 }
6681 }
6682 } /* prec */
6683 if (fmtcnt >= 0) {
6684 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 if (--fmtcnt >= 0)
6686 c = *fmt++;
6687 }
6688 }
6689 if (fmtcnt < 0) {
6690 PyErr_SetString(PyExc_ValueError,
6691 "incomplete format");
6692 goto onError;
6693 }
6694 if (c != '%') {
6695 v = getnextarg(args, arglen, &argidx);
6696 if (v == NULL)
6697 goto onError;
6698 }
6699 sign = 0;
6700 fill = ' ';
6701 switch (c) {
6702
6703 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006704 pbuf = formatbuf;
6705 /* presume that buffer length is at least 1 */
6706 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707 len = 1;
6708 break;
6709
6710 case 's':
6711 case 'r':
6712 if (PyUnicode_Check(v) && c == 's') {
6713 temp = v;
6714 Py_INCREF(temp);
6715 }
6716 else {
6717 PyObject *unicode;
6718 if (c == 's')
6719 temp = PyObject_Str(v);
6720 else
6721 temp = PyObject_Repr(v);
6722 if (temp == NULL)
6723 goto onError;
6724 if (!PyString_Check(temp)) {
6725 /* XXX Note: this should never happen, since
6726 PyObject_Repr() and PyObject_Str() assure
6727 this */
6728 Py_DECREF(temp);
6729 PyErr_SetString(PyExc_TypeError,
6730 "%s argument has non-string str()");
6731 goto onError;
6732 }
Fred Drakee4315f52000-05-09 19:53:39 +00006733 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006735 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 "strict");
6737 Py_DECREF(temp);
6738 temp = unicode;
6739 if (temp == NULL)
6740 goto onError;
6741 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006742 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 len = PyUnicode_GET_SIZE(temp);
6744 if (prec >= 0 && len > prec)
6745 len = prec;
6746 break;
6747
6748 case 'i':
6749 case 'd':
6750 case 'u':
6751 case 'o':
6752 case 'x':
6753 case 'X':
6754 if (c == 'i')
6755 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006756 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006757 temp = formatlong(v, flags, prec, c);
6758 if (!temp)
6759 goto onError;
6760 pbuf = PyUnicode_AS_UNICODE(temp);
6761 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00006762 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006764 else {
6765 pbuf = formatbuf;
6766 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6767 flags, prec, c, v);
6768 if (len < 0)
6769 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006770 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006771 }
6772 if (flags & F_ZERO)
6773 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 break;
6775
6776 case 'e':
6777 case 'E':
6778 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006779 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 case 'g':
6781 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006782 if (c == 'F')
6783 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006784 pbuf = formatbuf;
6785 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6786 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787 if (len < 0)
6788 goto onError;
6789 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006790 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791 fill = '0';
6792 break;
6793
6794 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006795 pbuf = formatbuf;
6796 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797 if (len < 0)
6798 goto onError;
6799 break;
6800
6801 default:
6802 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006803 "unsupported format character '%c' (0x%x) "
6804 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006805 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006806 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006807 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808 goto onError;
6809 }
6810 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006811 if (*pbuf == '-' || *pbuf == '+') {
6812 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 len--;
6814 }
6815 else if (flags & F_SIGN)
6816 sign = '+';
6817 else if (flags & F_BLANK)
6818 sign = ' ';
6819 else
6820 sign = 0;
6821 }
6822 if (width < len)
6823 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006824 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825 reslen -= rescnt;
6826 rescnt = width + fmtcnt + 100;
6827 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006828 if (reslen < 0) {
6829 Py_DECREF(result);
6830 return PyErr_NoMemory();
6831 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006832 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833 return NULL;
6834 res = PyUnicode_AS_UNICODE(result)
6835 + reslen - rescnt;
6836 }
6837 if (sign) {
6838 if (fill != ' ')
6839 *res++ = sign;
6840 rescnt--;
6841 if (width > len)
6842 width--;
6843 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006844 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6845 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006846 assert(pbuf[1] == c);
6847 if (fill != ' ') {
6848 *res++ = *pbuf++;
6849 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006850 }
Tim Petersfff53252001-04-12 18:38:48 +00006851 rescnt -= 2;
6852 width -= 2;
6853 if (width < 0)
6854 width = 0;
6855 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 if (width > len && !(flags & F_LJUST)) {
6858 do {
6859 --rescnt;
6860 *res++ = fill;
6861 } while (--width > len);
6862 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006863 if (fill == ' ') {
6864 if (sign)
6865 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006866 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006867 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006868 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006869 *res++ = *pbuf++;
6870 *res++ = *pbuf++;
6871 }
6872 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006873 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874 res += len;
6875 rescnt -= len;
6876 while (--width >= len) {
6877 --rescnt;
6878 *res++ = ' ';
6879 }
6880 if (dict && (argidx < arglen) && c != '%') {
6881 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006882 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 goto onError;
6884 }
6885 Py_XDECREF(temp);
6886 } /* '%' */
6887 } /* until end */
6888 if (argidx < arglen && !dict) {
6889 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006890 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891 goto onError;
6892 }
6893
6894 if (args_owned) {
6895 Py_DECREF(args);
6896 }
6897 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00006898 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006899 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900 return (PyObject *)result;
6901
6902 onError:
6903 Py_XDECREF(result);
6904 Py_DECREF(uformat);
6905 if (args_owned) {
6906 Py_DECREF(args);
6907 }
6908 return NULL;
6909}
6910
6911static PyBufferProcs unicode_as_buffer = {
6912 (getreadbufferproc) unicode_buffer_getreadbuf,
6913 (getwritebufferproc) unicode_buffer_getwritebuf,
6914 (getsegcountproc) unicode_buffer_getsegcount,
6915 (getcharbufferproc) unicode_buffer_getcharbuf,
6916};
6917
Jeremy Hylton938ace62002-07-17 16:30:39 +00006918static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006919unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6920
Tim Peters6d6c1a32001-08-02 04:15:00 +00006921static PyObject *
6922unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6923{
6924 PyObject *x = NULL;
6925 static char *kwlist[] = {"string", "encoding", "errors", 0};
6926 char *encoding = NULL;
6927 char *errors = NULL;
6928
Guido van Rossume023fe02001-08-30 03:12:59 +00006929 if (type != &PyUnicode_Type)
6930 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006931 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6932 kwlist, &x, &encoding, &errors))
6933 return NULL;
6934 if (x == NULL)
6935 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006936 if (encoding == NULL && errors == NULL)
6937 return PyObject_Unicode(x);
6938 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006939 return PyUnicode_FromEncodedObject(x, encoding, errors);
6940}
6941
Guido van Rossume023fe02001-08-30 03:12:59 +00006942static PyObject *
6943unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6944{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006945 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006946 int n;
6947
6948 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6949 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6950 if (tmp == NULL)
6951 return NULL;
6952 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006953 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006954 if (pnew == NULL) {
6955 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00006956 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00006957 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006958 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6959 if (pnew->str == NULL) {
6960 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006961 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006962 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00006963 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00006964 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006965 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6966 pnew->length = n;
6967 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006968 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006969 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006970}
6971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006972PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00006973"unicode(string [, encoding[, errors]]) -> object\n\
6974\n\
6975Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00006976encoding defaults to the current default string encoding.\n\
6977errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006978
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979PyTypeObject PyUnicode_Type = {
6980 PyObject_HEAD_INIT(&PyType_Type)
6981 0, /* ob_size */
6982 "unicode", /* tp_name */
6983 sizeof(PyUnicodeObject), /* tp_size */
6984 0, /* tp_itemsize */
6985 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00006986 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006988 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 0, /* tp_setattr */
6990 (cmpfunc) unicode_compare, /* tp_compare */
6991 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006992 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006994 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 (hashfunc) unicode_hash, /* tp_hash*/
6996 0, /* tp_call*/
6997 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006998 PyObject_GenericGetAttr, /* tp_getattro */
6999 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007001 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7002 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007003 unicode_doc, /* tp_doc */
7004 0, /* tp_traverse */
7005 0, /* tp_clear */
7006 0, /* tp_richcompare */
7007 0, /* tp_weaklistoffset */
7008 0, /* tp_iter */
7009 0, /* tp_iternext */
7010 unicode_methods, /* tp_methods */
7011 0, /* tp_members */
7012 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007013 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007014 0, /* tp_dict */
7015 0, /* tp_descr_get */
7016 0, /* tp_descr_set */
7017 0, /* tp_dictoffset */
7018 0, /* tp_init */
7019 0, /* tp_alloc */
7020 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007021 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022};
7023
7024/* Initialize the Unicode implementation */
7025
Thomas Wouters78890102000-07-22 19:25:51 +00007026void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007028 int i;
7029
Fred Drakee4315f52000-05-09 19:53:39 +00007030 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007031 unicode_freelist = NULL;
7032 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007034 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007035 for (i = 0; i < 256; i++)
7036 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007037 if (PyType_Ready(&PyUnicode_Type) < 0)
7038 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039}
7040
7041/* Finalize the Unicode implementation */
7042
7043void
Thomas Wouters78890102000-07-22 19:25:51 +00007044_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007046 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007047 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007049 Py_XDECREF(unicode_empty);
7050 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007051
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007052 for (i = 0; i < 256; i++) {
7053 if (unicode_latin1[i]) {
7054 Py_DECREF(unicode_latin1[i]);
7055 unicode_latin1[i] = NULL;
7056 }
7057 }
7058
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007059 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060 PyUnicodeObject *v = u;
7061 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007062 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007063 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007064 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007065 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007067 unicode_freelist = NULL;
7068 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007070
7071/*
7072Local variables:
7073c-basic-offset: 4
7074indent-tabs-mode: nil
7075End:
7076*/