blob: 0858525584ba1444befd613f0d7cad3e867fe571 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000056 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
/* Resize the Unicode object referenced by *unicode to `length`
   characters.

   *unicode must be a Unicode object with a reference count of exactly
   1 and `length` must not be negative; otherwise a bad-internal-call
   error is raised.  The empty object and one-character objects may be
   shared, so when their size changes *unicode is replaced by a fresh
   object holding a copy of the content.  All other objects are
   resized in place.

   Returns 0 on success, -1 with an exception set on failure. */
int PyUnicode_Resize(PyObject **unicode, int length)
{
    register PyUnicodeObject *v;
    PyUnicodeObject *fresh;
    int ncopy;

    /* Argument checks */
    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }
    v = (PyUnicodeObject *)*unicode;
    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
        PyErr_BadInternalCall();
        return -1;
    }

    /* Unshared objects (and no-op resizes) are handled in place, so
       *unicode does not have to be modified. */
    if (v->length == length ||
        (v != unicode_empty && v->length != 1))
        return unicode_resize(v, length);

    /* Possibly shared object: hand back a fresh copy instead. */
    fresh = _PyUnicode_New(length);
    if (fresh == NULL)
        return -1;
    ncopy = (length < v->length) ? length : v->length;
    Py_UNICODE_COPY(fresh->str, v->str, ncopy);
    Py_DECREF(*unicode);
    *unicode = (PyObject *)fresh;
    return 0;
}
300
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper that casts the address of a PyUnicodeObject *
   variable to the PyObject ** expected by PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
304
/* Create a Unicode object of `size` characters.

   When `u` is non-NULL its content is copied into the result, and the
   shared empty object / shared single-character Latin-1 objects are
   returned where applicable.  When `u` is NULL only the buffer is
   allocated and its content is left to the caller.

   Returns a new reference, or NULL with an exception set. */
PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
                                int size)
{
    PyUnicodeObject *result;

    /* If the Unicode data is known at construction time, commonly used
       objects can be shared. */

    /* Optimization for empty strings */
    if (u != NULL && size == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return (PyObject *)unicode_empty;
    }

    /* Single character Unicode objects in the Latin-1 range are
       shared when using this constructor; the table entry is created
       on first use. */
    if (u != NULL && size == 1 && *u < 256) {
        result = unicode_latin1[*u];
        if (result == NULL) {
            result = _PyUnicode_New(1);
            if (result == NULL)
                return NULL;
            result->str[0] = *u;
            unicode_latin1[*u] = result;
        }
        Py_INCREF(result);
        return (PyObject *)result;
    }

    result = _PyUnicode_New(size);
    if (result != NULL && u != NULL) {
        /* Copy the Unicode data into the new object */
        Py_UNICODE_COPY(result->str, u, size);
    }
    return (PyObject *)result;
}
346
347#ifdef HAVE_WCHAR_H
348
/* Create a Unicode object from the first `size` elements of the
   wchar_t buffer `w`.

   Returns a new reference, or NULL with an exception set (a NULL `w`
   is reported as a bad internal call). */
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
                                 int size)
{
    PyUnicodeObject *unicode;

    if (w == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;

    /* Copy the wchar_t data into the new object */
#ifdef HAVE_USABLE_WCHAR_T
    memcpy(unicode->str, w, size * sizeof(wchar_t));
#else
    {
        register Py_UNICODE *u;
        register int i;
        u = PyUnicode_AS_UNICODE(unicode);
        /* Copy exactly `size` elements.  Looping while i >= 0 would
           read w[size], which lies outside the caller's data, and
           overwrite the terminator already stored by _PyUnicode_New(). */
        for (i = size; i > 0; i--)
            *u++ = *w++;
    }
#endif

    return (PyObject *)unicode;
}
378
/* Copy the content of `unicode` into the wchar_t buffer `w`, which has
   room for `size` elements.

   At most `size` elements are written.  When the buffer is larger than
   the string, the terminating 0 element is copied as well; otherwise
   the result is not 0-terminated.

   Returns the number of characters copied (excluding any terminator),
   or -1 with an exception set if `unicode` is NULL. */
int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
                         register wchar_t *w,
                         int size)
{
    int length;
    int ncopy;

    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }

    length = PyUnicode_GET_SIZE(unicode);
    /* If there is room, copy the 0-termination too; never write more
       than `size` elements into the caller's buffer. */
    if (size > length)
        ncopy = length + 1;
    else
        ncopy = size;

#ifdef HAVE_USABLE_WCHAR_T
    memcpy(w, unicode->str, ncopy * sizeof(wchar_t));
#else
    {
        register Py_UNICODE *u;
        register int i;
        u = PyUnicode_AS_UNICODE(unicode);
        for (i = ncopy; i > 0; i--)
            *w++ = *u++;
    }
#endif

    return (size > length) ? length : size;
}
403
404#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000408 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
Hye-Shik Chang40574832004-04-06 07:24:51 +0000426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000428}
429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromObject(register PyObject *obj)
431{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000432 /* XXX Perhaps we should make this API an alias of
433 PyObject_Unicode() instead ?! */
434 if (PyUnicode_CheckExact(obj)) {
435 Py_INCREF(obj);
436 return obj;
437 }
438 if (PyUnicode_Check(obj)) {
439 /* For a Unicode subtype that's not a Unicode object,
440 return a true Unicode object with the same data. */
441 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
442 PyUnicode_GET_SIZE(obj));
443 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000444 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
445}
446
447PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
448 const char *encoding,
449 const char *errors)
450{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000451 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000453 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000454
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 if (obj == NULL) {
456 PyErr_BadInternalCall();
457 return NULL;
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460#if 0
461 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000462 that no encodings is given and then redirect to
463 PyObject_Unicode() which then applies the additional logic for
464 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466 NOTE: This API should really only be used for object which
467 represent *encoded* Unicode !
468
469 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000470 if (PyUnicode_Check(obj)) {
471 if (encoding) {
472 PyErr_SetString(PyExc_TypeError,
473 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000474 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000475 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000476 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000477 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000478#else
479 if (PyUnicode_Check(obj)) {
480 PyErr_SetString(PyExc_TypeError,
481 "decoding Unicode is not supported");
482 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#endif
485
486 /* Coerce object */
487 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000488 s = PyString_AS_STRING(obj);
489 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
492 /* Overwrite the error message with something more useful in
493 case of a TypeError. */
494 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000495 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000496 "coercing to Unicode: need string or buffer, "
497 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 obj->ob_type->tp_name);
499 goto onError;
500 }
Tim Petersced69f82003-09-16 20:30:58 +0000501
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000502 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 if (len == 0) {
504 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000505 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000509
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 return v;
511
512 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514}
515
/* Decode `size` bytes starting at `s` into a Unicode object.

   A NULL `encoding` selects the default encoding.  The encodings
   "utf-8", "latin-1", "ascii" (and "mbcs" where available) are
   dispatched directly to their decoders; every other encoding goes
   through the codec registry, whose decoder must return a Unicode
   object.

   Returns a new reference, or NULL with an exception set. */
PyObject *PyUnicode_Decode(const char *s,
                           int size,
                           const char *encoding,
                           const char *errors)
{
    PyObject *buffer;
    PyObject *decoded;
    PyObject *result = NULL;

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Shortcuts for common default encodings */
    if (strcmp(encoding, "utf-8") == 0)
        return PyUnicode_DecodeUTF8(s, size, errors);
    if (strcmp(encoding, "latin-1") == 0)
        return PyUnicode_DecodeLatin1(s, size, errors);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
    if (strcmp(encoding, "mbcs") == 0)
        return PyUnicode_DecodeMBCS(s, size, errors);
#endif
    if (strcmp(encoding, "ascii") == 0)
        return PyUnicode_DecodeASCII(s, size, errors);

    /* Decode via the codec registry; the input is wrapped in a buffer
       object for the duration of the call. */
    buffer = PyBuffer_FromMemory((void *)s, size);
    if (buffer == NULL)
        return NULL;

    decoded = PyCodec_Decode(buffer, encoding, errors);
    if (decoded != NULL) {
        if (PyUnicode_Check(decoded)) {
            result = decoded;
        }
        else {
            PyErr_Format(PyExc_TypeError,
                         "decoder did not return an unicode object (type=%.400s)",
                         decoded->ob_type->tp_name);
            Py_DECREF(decoded);
        }
    }

    Py_DECREF(buffer);
    return result;
}
559
560PyObject *PyUnicode_Encode(const Py_UNICODE *s,
561 int size,
562 const char *encoding,
563 const char *errors)
564{
565 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000566
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 unicode = PyUnicode_FromUnicode(s, size);
568 if (unicode == NULL)
569 return NULL;
570 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
571 Py_DECREF(unicode);
572 return v;
573}
574
/* Encode a Unicode object with `encoding` and `errors`.

   A NULL `encoding` selects the default encoding.  When `errors` is
   NULL the encodings "utf-8", "latin-1", "ascii" (and "mbcs" where
   available) are dispatched directly to their encoders; everything
   else goes through the codec registry, whose encoder must return a
   string object.

   Returns a new reference, or NULL with an exception set. */
PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    PyObject *encoded;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Shortcuts for common default encodings */
    if (errors == NULL) {
        if (strcmp(encoding, "utf-8") == 0)
            return PyUnicode_AsUTF8String(unicode);
        if (strcmp(encoding, "latin-1") == 0)
            return PyUnicode_AsLatin1String(unicode);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
        if (strcmp(encoding, "mbcs") == 0)
            return PyUnicode_AsMBCSString(unicode);
#endif
        if (strcmp(encoding, "ascii") == 0)
            return PyUnicode_AsASCIIString(unicode);
    }

    /* Encode via the codec registry */
    encoded = PyCodec_Encode(unicode, encoding, errors);
    if (encoded == NULL)
        return NULL;

    /* XXX Should we really enforce this ? */
    if (!PyString_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "encoder did not return a string object (type=%.400s)",
                     encoded->ob_type->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }
    return encoded;
}
620
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000621PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
622 const char *errors)
623{
624 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
625
626 if (v)
627 return v;
628 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
629 if (v && errors == NULL)
630 ((PyUnicodeObject *)unicode)->defenc = v;
631 return v;
632}
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
635{
636 if (!PyUnicode_Check(unicode)) {
637 PyErr_BadArgument();
638 goto onError;
639 }
640 return PyUnicode_AS_UNICODE(unicode);
641
642 onError:
643 return NULL;
644}
645
646int PyUnicode_GetSize(PyObject *unicode)
647{
648 if (!PyUnicode_Check(unicode)) {
649 PyErr_BadArgument();
650 goto onError;
651 }
652 return PyUnicode_GET_SIZE(unicode);
653
654 onError:
655 return -1;
656}
657
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000658int PyUnicode_GetWidth(PyObject *unicode)
659{
660 const Py_UNICODE *p, *e;
661 int width;
662
663 if (!PyUnicode_Check(unicode)) {
664 PyErr_BadArgument();
665 return -1;
666 }
667
668 p = PyUnicode_AS_UNICODE(unicode);
669 e = p + PyUnicode_GET_SIZE(unicode);
670 for (width = 0; p < e; p++)
671 if (Py_UNICODE_ISWIDE(*p))
672 width += 2;
673 else
674 width++;
675
676 return width;
677}
678
Thomas Wouters78890102000-07-22 19:25:51 +0000679const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000680{
681 return unicode_default_encoding;
682}
683
/* Set the default encoding to `encoding`.

   The name is validated by looking it up in the codec registry first.
   Returns 0 on success, or -1 if the lookup fails (the error is left
   as set by _PyCodec_Lookup()). */
int PyUnicode_SetDefaultEncoding(const char *encoding)
{
    PyObject *v;

    /* Make sure the encoding is valid. As side effect, this also
       loads the encoding into the codec registry cache. */
    v = _PyCodec_Lookup(encoding);
    if (v == NULL)
        return -1;
    Py_DECREF(v);

    /* strncpy() does not terminate the destination when the source
       fills it completely, so copy at most size-1 bytes and terminate
       explicitly; readers treat the buffer as a C string. */
    strncpy(unicode_default_encoding,
            encoding,
            sizeof(unicode_default_encoding) - 1);
    unicode_default_encoding[sizeof(unicode_default_encoding) - 1] = '\0';
    return 0;
}
702
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   The handler is looked up lazily from `errors` and cached in
   *errorHandler; the exception object is created on first use and
   updated in place afterwards.  The handler must return a
   (unicode, int) tuple: the replacement text and the input position
   at which decoding resumes (negative values count from the end of
   the input).  *output is grown as needed, and *endinpos, *inptr,
   *outptr and *outpos are advanced past the replacement.

   return 0 on success, -1 on error
*/

static
int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
                 const char *encoding, const char *reason,
                 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
                 PyObject **output, int *outpos, Py_UNICODE **outptr)
{
    static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";

    PyObject *restuple = NULL;
    PyObject *replacement = NULL;   /* borrowed from restuple */
    int capacity = PyUnicode_GET_SIZE(*output);
    int needed;
    int newpos;
    int replen;
    int status = -1;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto done;
    }

    if (*exceptionObject == NULL) {
        *exceptionObject = PyUnicodeDecodeError_Create(
            encoding, input, insize, *startinpos, *endinpos, reason);
        if (*exceptionObject == NULL)
            goto done;
    }
    else if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos) ||
             PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos) ||
             PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) {
        goto done;
    }

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto done;
    if (!PyTuple_Check(restuple)) {
        /* Skip the "O!i;" prefix to reuse the message part. */
        PyErr_Format(PyExc_TypeError, &argparse[4]);
        goto done;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &replacement, &newpos))
        goto done;
    if (newpos < 0)
        newpos = insize + newpos;
    if (newpos < 0 || newpos > insize) {
        PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
        goto done;
    }

    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    replen = PyUnicode_GET_SIZE(replacement);
    needed = *outpos + replen + insize - newpos;
    if (needed > capacity) {
        if (needed < 2 * capacity)
            needed = 2 * capacity;
        if (PyUnicode_Resize(output, needed) < 0)
            goto done;
        /* The buffer may have moved; recompute the write pointer. */
        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
    }
    *endinpos = newpos;
    *inptr = input + newpos;
    Py_UNICODE_COPY(*outptr, PyUnicode_AS_UNICODE(replacement), replen);
    *outptr += replen;
    *outpos += replen;
    /* we made it! */
    status = 0;

 done:
    Py_XDECREF(restuple);
    return status;
}
790
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000791/* --- UTF-7 Codec -------------------------------------------------------- */
792
793/* see RFC2152 for details */
794
/* Classification of the 128 ASCII code points for the UTF-7 codec
   (see RFC 2152).  The table is indexed by character code and tells
   whether a character is special, i.e. cannot be directly encoded:

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f: control characters; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space and punctuation */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: digits and punctuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: '@' and 'A'..'O' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: 'P'..'Z' and punctuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: '`' and 'a'..'o' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: 'p'..'z', punctuation and DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
813
/* SPECIAL(c, encodeO, encodeWS) is true when character c may not be
   written directly: it is above 127 or marked 1 in utf7_special, or it
   is whitespace (2) and encodeWS is set, or it is in Set O (3) and
   encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n) maps the low 6 bits of n to the corresponding base64 digit. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c) is true when c is one of the 64 base64 digits.  Explicit
   ASCII range tests are used instead of isalnum(): isalnum() depends on
   the current locale and is undefined for arguments that are not
   representable as unsigned char, which can happen when c is a
   Py_UNICODE value. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c) is the inverse of B64(): the 6-bit value of base64 digit c.
   The result is only meaningful when B64CHAR(c) is true. */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, write
   the topmost 6 pending bits to *out as a base64 digit and advance out.
   bits is the number of pending bits and is updated in place. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, extract the topmost 16 pending bits as one Py_UNICODE and store
   it through out.  The expansion assigns to a variable named errmsg and
   jumps to a label named utf7Error, so both must exist at the point of
   use. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct \
               or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character. \
               NOTE(review): the range tested here is 0xDC00..0xDFFF; \
               confirm whether 0xD800..0xDBFF was intended. */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }

Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000849PyObject *PyUnicode_DecodeUTF7(const char *s,
850 int size,
851 const char *errors)
852{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000853 const char *starts = s;
854 int startinpos;
855 int endinpos;
856 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000857 const char *e;
858 PyUnicodeObject *unicode;
859 Py_UNICODE *p;
860 const char *errmsg = "";
861 int inShift = 0;
862 unsigned int bitsleft = 0;
863 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000864 int surrogate = 0;
865 PyObject *errorHandler = NULL;
866 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000867
868 unicode = _PyUnicode_New(size);
869 if (!unicode)
870 return NULL;
871 if (size == 0)
872 return (PyObject *)unicode;
873
874 p = unicode->str;
875 e = s + size;
876
877 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000878 Py_UNICODE ch;
879 restart:
880 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000881
882 if (inShift) {
883 if ((ch == '-') || !B64CHAR(ch)) {
884 inShift = 0;
885 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000886
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000887 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
888 if (bitsleft >= 6) {
889 /* The shift sequence has a partial character in it. If
890 bitsleft < 6 then we could just classify it as padding
891 but that is not the case here */
892
893 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000894 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000895 }
896 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000897 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000898 here so indicate the potential of a misencoded character. */
899
900 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
901 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
902 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000903 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000904 }
905
906 if (ch == '-') {
907 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000908 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000909 inShift = 1;
910 }
911 } else if (SPECIAL(ch,0,0)) {
912 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000913 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000914 } else {
915 *p++ = ch;
916 }
917 } else {
918 charsleft = (charsleft << 6) | UB64(ch);
919 bitsleft += 6;
920 s++;
921 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
922 }
923 }
924 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000925 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000926 s++;
927 if (s < e && *s == '-') {
928 s++;
929 *p++ = '+';
930 } else
931 {
932 inShift = 1;
933 bitsleft = 0;
934 }
935 }
936 else if (SPECIAL(ch,0,0)) {
937 errmsg = "unexpected special character";
938 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000939 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000940 }
941 else {
942 *p++ = ch;
943 s++;
944 }
945 continue;
946 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000947 outpos = p-PyUnicode_AS_UNICODE(unicode);
948 endinpos = s-starts;
949 if (unicode_decode_call_errorhandler(
950 errors, &errorHandler,
951 "utf7", errmsg,
952 starts, size, &startinpos, &endinpos, &exc, &s,
953 (PyObject **)&unicode, &outpos, &p))
954 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000955 }
956
957 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000958 outpos = p-PyUnicode_AS_UNICODE(unicode);
959 endinpos = size;
960 if (unicode_decode_call_errorhandler(
961 errors, &errorHandler,
962 "utf7", "unterminated shift sequence",
963 starts, size, &startinpos, &endinpos, &exc, &s,
964 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000965 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000966 if (s < e)
967 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000968 }
969
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000970 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000971 goto onError;
972
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000973 Py_XDECREF(errorHandler);
974 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000975 return (PyObject *)unicode;
976
977onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000978 Py_XDECREF(errorHandler);
979 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 Py_DECREF(unicode);
981 return NULL;
982}
983
984
985PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
986 int size,
987 int encodeSetO,
988 int encodeWhiteSpace,
989 const char *errors)
990{
991 PyObject *v;
992 /* It might be possible to tighten this worst case */
993 unsigned int cbAllocated = 5 * size;
994 int inShift = 0;
995 int i = 0;
996 unsigned int bitsleft = 0;
997 unsigned long charsleft = 0;
998 char * out;
999 char * start;
1000
1001 if (size == 0)
1002 return PyString_FromStringAndSize(NULL, 0);
1003
1004 v = PyString_FromStringAndSize(NULL, cbAllocated);
1005 if (v == NULL)
1006 return NULL;
1007
1008 start = out = PyString_AS_STRING(v);
1009 for (;i < size; ++i) {
1010 Py_UNICODE ch = s[i];
1011
1012 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001013 if (ch == '+') {
1014 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001015 *out++ = '-';
1016 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1017 charsleft = ch;
1018 bitsleft = 16;
1019 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001020 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001021 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001022 } else {
1023 *out++ = (char) ch;
1024 }
1025 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001026 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1027 *out++ = B64(charsleft << (6-bitsleft));
1028 charsleft = 0;
1029 bitsleft = 0;
1030 /* Characters not in the BASE64 set implicitly unshift the sequence
1031 so no '-' is required, except if the character is itself a '-' */
1032 if (B64CHAR(ch) || ch == '-') {
1033 *out++ = '-';
1034 }
1035 inShift = 0;
1036 *out++ = (char) ch;
1037 } else {
1038 bitsleft += 16;
1039 charsleft = (charsleft << 16) | ch;
1040 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1041
1042 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001043 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001044 or '-' then the shift sequence will be terminated implicitly and we
1045 don't have to insert a '-'. */
1046
1047 if (bitsleft == 0) {
1048 if (i + 1 < size) {
1049 Py_UNICODE ch2 = s[i+1];
1050
1051 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001052
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001053 } else if (B64CHAR(ch2) || ch2 == '-') {
1054 *out++ = '-';
1055 inShift = 0;
1056 } else {
1057 inShift = 0;
1058 }
1059
1060 }
1061 else {
1062 *out++ = '-';
1063 inShift = 0;
1064 }
1065 }
Tim Petersced69f82003-09-16 20:30:58 +00001066 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001067 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001068 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001069 if (bitsleft) {
1070 *out++= B64(charsleft << (6-bitsleft) );
1071 *out++ = '-';
1072 }
1073
Tim Peters5de98422002-04-27 18:44:32 +00001074 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001075 return v;
1076}
1077
/* The UTF-7 helper macros are not needed past this point. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1084
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085/* --- UTF-8 Codec -------------------------------------------------------- */
1086
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not valid as the first byte of a sequence */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six byte sequences; 0xfe and 0xff
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1108
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109PyObject *PyUnicode_DecodeUTF8(const char *s,
1110 int size,
1111 const char *errors)
1112{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001113 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001115 int startinpos;
1116 int endinpos;
1117 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118 const char *e;
1119 PyUnicodeObject *unicode;
1120 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001121 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001122 PyObject *errorHandler = NULL;
1123 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001124
1125 /* Note: size will always be longer than the resulting Unicode
1126 character count */
1127 unicode = _PyUnicode_New(size);
1128 if (!unicode)
1129 return NULL;
1130 if (size == 0)
1131 return (PyObject *)unicode;
1132
1133 /* Unpack UTF-8 encoded data */
1134 p = unicode->str;
1135 e = s + size;
1136
1137 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001138 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001139
1140 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001141 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142 s++;
1143 continue;
1144 }
1145
1146 n = utf8_code_length[ch];
1147
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001148 if (s + n > e) {
1149 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001150 startinpos = s-starts;
1151 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001152 goto utf8Error;
1153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154
1155 switch (n) {
1156
1157 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001158 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001159 startinpos = s-starts;
1160 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001161 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162
1163 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001164 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001165 startinpos = s-starts;
1166 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001167 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168
1169 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001170 if ((s[1] & 0xc0) != 0x80) {
1171 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001172 startinpos = s-starts;
1173 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001174 goto utf8Error;
1175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001177 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001178 startinpos = s-starts;
1179 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001180 errmsg = "illegal encoding";
1181 goto utf8Error;
1182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001184 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 break;
1186
1187 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001188 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001189 (s[2] & 0xc0) != 0x80) {
1190 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001191 startinpos = s-starts;
1192 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001193 goto utf8Error;
1194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001196 if (ch < 0x0800) {
1197 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001198 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001199
1200 XXX For wide builds (UCS-4) we should probably try
1201 to recombine the surrogates into a single code
1202 unit.
1203 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001204 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001205 startinpos = s-starts;
1206 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001207 goto utf8Error;
1208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001210 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001211 break;
1212
1213 case 4:
1214 if ((s[1] & 0xc0) != 0x80 ||
1215 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001216 (s[3] & 0xc0) != 0x80) {
1217 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001218 startinpos = s-starts;
1219 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001220 goto utf8Error;
1221 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001222 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1223 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1224 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001225 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001226 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001227 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001228 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001229 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001230 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001231 startinpos = s-starts;
1232 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001233 goto utf8Error;
1234 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001235#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001236 *p++ = (Py_UNICODE)ch;
1237#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001238 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001239
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001240 /* translate from 10000..10FFFF to 0..FFFF */
1241 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001242
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001243 /* high surrogate = top 10 bits added to D800 */
1244 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001245
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001246 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001247 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001248#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249 break;
1250
1251 default:
1252 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001253 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001254 startinpos = s-starts;
1255 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001256 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001257 }
1258 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001259 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001260
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001262 outpos = p-PyUnicode_AS_UNICODE(unicode);
1263 if (unicode_decode_call_errorhandler(
1264 errors, &errorHandler,
1265 "utf8", errmsg,
1266 starts, size, &startinpos, &endinpos, &exc, &s,
1267 (PyObject **)&unicode, &outpos, &p))
1268 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269 }
1270
1271 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001272 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 goto onError;
1274
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001275 Py_XDECREF(errorHandler);
1276 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 return (PyObject *)unicode;
1278
1279onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 Py_XDECREF(errorHandler);
1281 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001282 Py_DECREF(unicode);
1283 return NULL;
1284}
1285
Tim Peters602f7402002-04-27 18:03:26 +00001286/* Allocation strategy: if the string is short, convert into a stack buffer
1287 and allocate exactly as much space needed at the end. Else allocate the
1288 maximum possible needed (4 result bytes per Unicode character), and return
1289 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001290*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001291PyObject *
1292PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1293 int size,
1294 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295{
Tim Peters602f7402002-04-27 18:03:26 +00001296#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001297
Tim Peters602f7402002-04-27 18:03:26 +00001298 int i; /* index into s of next input byte */
1299 PyObject *v; /* result string object */
1300 char *p; /* next free byte in output buffer */
1301 int nallocated; /* number of result bytes allocated */
1302 int nneeded; /* number of result bytes needed */
1303 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001304
Tim Peters602f7402002-04-27 18:03:26 +00001305 assert(s != NULL);
1306 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307
Tim Peters602f7402002-04-27 18:03:26 +00001308 if (size <= MAX_SHORT_UNICHARS) {
1309 /* Write into the stack buffer; nallocated can't overflow.
1310 * At the end, we'll allocate exactly as much heap space as it
1311 * turns out we need.
1312 */
1313 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1314 v = NULL; /* will allocate after we're done */
1315 p = stackbuf;
1316 }
1317 else {
1318 /* Overallocate on the heap, and give the excess back at the end. */
1319 nallocated = size * 4;
1320 if (nallocated / 4 != size) /* overflow! */
1321 return PyErr_NoMemory();
1322 v = PyString_FromStringAndSize(NULL, nallocated);
1323 if (v == NULL)
1324 return NULL;
1325 p = PyString_AS_STRING(v);
1326 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001327
Tim Peters602f7402002-04-27 18:03:26 +00001328 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001329 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001330
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001331 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001332 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001334
Guido van Rossumd57fd912000-03-10 22:53:23 +00001335 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001336 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001337 *p++ = (char)(0xc0 | (ch >> 6));
1338 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001339 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001340 else {
Tim Peters602f7402002-04-27 18:03:26 +00001341 /* Encode UCS2 Unicode ordinals */
1342 if (ch < 0x10000) {
1343 /* Special case: check for high surrogate */
1344 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1345 Py_UCS4 ch2 = s[i];
1346 /* Check for low surrogate and combine the two to
1347 form a UCS4 value */
1348 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001349 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001350 i++;
1351 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001352 }
Tim Peters602f7402002-04-27 18:03:26 +00001353 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001354 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001355 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001356 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1357 *p++ = (char)(0x80 | (ch & 0x3f));
1358 continue;
1359 }
1360encodeUCS4:
1361 /* Encode UCS4 Unicode ordinals */
1362 *p++ = (char)(0xf0 | (ch >> 18));
1363 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1364 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1365 *p++ = (char)(0x80 | (ch & 0x3f));
1366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001368
Tim Peters602f7402002-04-27 18:03:26 +00001369 if (v == NULL) {
1370 /* This was stack allocated. */
1371 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1372 assert(nneeded <= nallocated);
1373 v = PyString_FromStringAndSize(stackbuf, nneeded);
1374 }
1375 else {
1376 /* Cut back to size actually needed. */
1377 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1378 assert(nneeded <= nallocated);
1379 _PyString_Resize(&v, nneeded);
1380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001382
Tim Peters602f7402002-04-27 18:03:26 +00001383#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384}
1385
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1387{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388 if (!PyUnicode_Check(unicode)) {
1389 PyErr_BadArgument();
1390 return NULL;
1391 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001392 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1393 PyUnicode_GET_SIZE(unicode),
1394 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001395}
1396
1397/* --- UTF-16 Codec ------------------------------------------------------- */
1398
Tim Peters772747b2001-08-09 22:21:55 +00001399PyObject *
1400PyUnicode_DecodeUTF16(const char *s,
1401 int size,
1402 const char *errors,
1403 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001405 const char *starts = s;
1406 int startinpos;
1407 int endinpos;
1408 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 PyUnicodeObject *unicode;
1410 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001411 const unsigned char *q, *e;
1412 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001413 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001414 /* Offsets from q for retrieving byte pairs in the right order. */
1415#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1416 int ihi = 1, ilo = 0;
1417#else
1418 int ihi = 0, ilo = 1;
1419#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 PyObject *errorHandler = NULL;
1421 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422
1423 /* Note: size will always be longer than the resulting Unicode
1424 character count */
1425 unicode = _PyUnicode_New(size);
1426 if (!unicode)
1427 return NULL;
1428 if (size == 0)
1429 return (PyObject *)unicode;
1430
1431 /* Unpack UTF-16 encoded data */
1432 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001433 q = (unsigned char *)s;
1434 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435
1436 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001437 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001439 /* Check for BOM marks (U+FEFF) in the input and adjust current
1440 byte order setting accordingly. In native mode, the leading BOM
1441 mark is skipped, in all other modes, it is copied to the output
1442 stream as-is (giving a ZWNBSP character). */
1443 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001444 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001445#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001446 if (bom == 0xFEFF) {
1447 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001448 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001449 }
1450 else if (bom == 0xFFFE) {
1451 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001452 bo = 1;
1453 }
Tim Petersced69f82003-09-16 20:30:58 +00001454#else
Tim Peters772747b2001-08-09 22:21:55 +00001455 if (bom == 0xFEFF) {
1456 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001457 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001458 }
1459 else if (bom == 0xFFFE) {
1460 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001461 bo = -1;
1462 }
1463#endif
1464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465
Tim Peters772747b2001-08-09 22:21:55 +00001466 if (bo == -1) {
1467 /* force LE */
1468 ihi = 1;
1469 ilo = 0;
1470 }
1471 else if (bo == 1) {
1472 /* force BE */
1473 ihi = 0;
1474 ilo = 1;
1475 }
1476
1477 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001478 Py_UNICODE ch;
1479 /* remaing bytes at the end? (size should be even) */
1480 if (e-q<2) {
1481 errmsg = "truncated data";
1482 startinpos = ((const char *)q)-starts;
1483 endinpos = ((const char *)e)-starts;
1484 goto utf16Error;
1485 /* The remaining input chars are ignored if the callback
1486 chooses to skip the input */
1487 }
1488 ch = (q[ihi] << 8) | q[ilo];
1489
Tim Peters772747b2001-08-09 22:21:55 +00001490 q += 2;
1491
Guido van Rossumd57fd912000-03-10 22:53:23 +00001492 if (ch < 0xD800 || ch > 0xDFFF) {
1493 *p++ = ch;
1494 continue;
1495 }
1496
1497 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001498 if (q >= e) {
1499 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500 startinpos = (((const char *)q)-2)-starts;
1501 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001502 goto utf16Error;
1503 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001504 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001505 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1506 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001507 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001508#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001509 *p++ = ch;
1510 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001511#else
1512 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001513#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001514 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001515 }
1516 else {
1517 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001518 startinpos = (((const char *)q)-4)-starts;
1519 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001520 goto utf16Error;
1521 }
1522
Guido van Rossumd57fd912000-03-10 22:53:23 +00001523 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001524 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 startinpos = (((const char *)q)-2)-starts;
1526 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001527 /* Fall through to report the error */
1528
1529 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 outpos = p-PyUnicode_AS_UNICODE(unicode);
1531 if (unicode_decode_call_errorhandler(
1532 errors, &errorHandler,
1533 "utf16", errmsg,
1534 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1535 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001536 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001537 }
1538
1539 if (byteorder)
1540 *byteorder = bo;
1541
1542 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001543 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544 goto onError;
1545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001546 Py_XDECREF(errorHandler);
1547 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548 return (PyObject *)unicode;
1549
1550onError:
1551 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 Py_XDECREF(errorHandler);
1553 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554 return NULL;
1555}
1556
Tim Peters772747b2001-08-09 22:21:55 +00001557PyObject *
1558PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1559 int size,
1560 const char *errors,
1561 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562{
1563 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001564 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001565#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001566 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001567#else
1568 const int pairs = 0;
1569#endif
Tim Peters772747b2001-08-09 22:21:55 +00001570 /* Offsets from p for storing byte pairs in the right order. */
1571#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1572 int ihi = 1, ilo = 0;
1573#else
1574 int ihi = 0, ilo = 1;
1575#endif
1576
1577#define STORECHAR(CH) \
1578 do { \
1579 p[ihi] = ((CH) >> 8) & 0xff; \
1580 p[ilo] = (CH) & 0xff; \
1581 p += 2; \
1582 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001584#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001585 for (i = pairs = 0; i < size; i++)
1586 if (s[i] >= 0x10000)
1587 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001588#endif
Tim Petersced69f82003-09-16 20:30:58 +00001589 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001590 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591 if (v == NULL)
1592 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593
Tim Peters772747b2001-08-09 22:21:55 +00001594 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001595 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001596 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001597 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001598 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001599
1600 if (byteorder == -1) {
1601 /* force LE */
1602 ihi = 1;
1603 ilo = 0;
1604 }
1605 else if (byteorder == 1) {
1606 /* force BE */
1607 ihi = 0;
1608 ilo = 1;
1609 }
1610
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001611 while (size-- > 0) {
1612 Py_UNICODE ch = *s++;
1613 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001614#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001615 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001616 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1617 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001619#endif
Tim Peters772747b2001-08-09 22:21:55 +00001620 STORECHAR(ch);
1621 if (ch2)
1622 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001623 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001624 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001625#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626}
1627
1628PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1629{
1630 if (!PyUnicode_Check(unicode)) {
1631 PyErr_BadArgument();
1632 return NULL;
1633 }
1634 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1635 PyUnicode_GET_SIZE(unicode),
1636 NULL,
1637 0);
1638}
1639
1640/* --- Unicode Escape Codec ----------------------------------------------- */
1641
Fredrik Lundh06d12682001-01-24 07:59:11 +00001642static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001643
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1645 int size,
1646 const char *errors)
1647{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001648 const char *starts = s;
1649 int startinpos;
1650 int endinpos;
1651 int outpos;
1652 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001655 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001656 char* message;
1657 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001658 PyObject *errorHandler = NULL;
1659 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001660
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 /* Escaped strings will always be longer than the resulting
1662 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001663 length after conversion to the true value.
1664 (but if the error callback returns a long replacement string
1665 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666 v = _PyUnicode_New(size);
1667 if (v == NULL)
1668 goto onError;
1669 if (size == 0)
1670 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001672 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001674
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 while (s < end) {
1676 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001677 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001678 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679
1680 /* Non-escape characters are interpreted as Unicode ordinals */
1681 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001682 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683 continue;
1684 }
1685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001686 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 /* \ - Escapes */
1688 s++;
1689 switch (*s++) {
1690
1691 /* \x escapes */
1692 case '\n': break;
1693 case '\\': *p++ = '\\'; break;
1694 case '\'': *p++ = '\''; break;
1695 case '\"': *p++ = '\"'; break;
1696 case 'b': *p++ = '\b'; break;
1697 case 'f': *p++ = '\014'; break; /* FF */
1698 case 't': *p++ = '\t'; break;
1699 case 'n': *p++ = '\n'; break;
1700 case 'r': *p++ = '\r'; break;
1701 case 'v': *p++ = '\013'; break; /* VT */
1702 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1703
1704 /* \OOO (octal) escapes */
1705 case '0': case '1': case '2': case '3':
1706 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001707 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001709 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001711 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001713 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714 break;
1715
Fredrik Lundhccc74732001-02-18 22:13:49 +00001716 /* hex escapes */
1717 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001719 digits = 2;
1720 message = "truncated \\xXX escape";
1721 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722
Fredrik Lundhccc74732001-02-18 22:13:49 +00001723 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001725 digits = 4;
1726 message = "truncated \\uXXXX escape";
1727 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728
Fredrik Lundhccc74732001-02-18 22:13:49 +00001729 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001730 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001731 digits = 8;
1732 message = "truncated \\UXXXXXXXX escape";
1733 hexescape:
1734 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001735 outpos = p-PyUnicode_AS_UNICODE(v);
1736 if (s+digits>end) {
1737 endinpos = size;
1738 if (unicode_decode_call_errorhandler(
1739 errors, &errorHandler,
1740 "unicodeescape", "end of string in escape sequence",
1741 starts, size, &startinpos, &endinpos, &exc, &s,
1742 (PyObject **)&v, &outpos, &p))
1743 goto onError;
1744 goto nextByte;
1745 }
1746 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001747 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001748 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001749 endinpos = (s+i+1)-starts;
1750 if (unicode_decode_call_errorhandler(
1751 errors, &errorHandler,
1752 "unicodeescape", message,
1753 starts, size, &startinpos, &endinpos, &exc, &s,
1754 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001755 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001756 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001757 }
1758 chr = (chr<<4) & ~0xF;
1759 if (c >= '0' && c <= '9')
1760 chr += c - '0';
1761 else if (c >= 'a' && c <= 'f')
1762 chr += 10 + c - 'a';
1763 else
1764 chr += 10 + c - 'A';
1765 }
1766 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001767 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001768 /* _decoding_error will have already written into the
1769 target buffer. */
1770 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001771 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001772 /* when we get here, chr is a 32-bit unicode character */
1773 if (chr <= 0xffff)
1774 /* UCS-2 character */
1775 *p++ = (Py_UNICODE) chr;
1776 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001777 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001778 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001779#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001780 *p++ = chr;
1781#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001782 chr -= 0x10000L;
1783 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001784 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001785#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001786 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001787 endinpos = s-starts;
1788 outpos = p-PyUnicode_AS_UNICODE(v);
1789 if (unicode_decode_call_errorhandler(
1790 errors, &errorHandler,
1791 "unicodeescape", "illegal Unicode character",
1792 starts, size, &startinpos, &endinpos, &exc, &s,
1793 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001794 goto onError;
1795 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001796 break;
1797
1798 /* \N{name} */
1799 case 'N':
1800 message = "malformed \\N character escape";
1801 if (ucnhash_CAPI == NULL) {
1802 /* load the unicode data module */
1803 PyObject *m, *v;
1804 m = PyImport_ImportModule("unicodedata");
1805 if (m == NULL)
1806 goto ucnhashError;
1807 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1808 Py_DECREF(m);
1809 if (v == NULL)
1810 goto ucnhashError;
1811 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1812 Py_DECREF(v);
1813 if (ucnhash_CAPI == NULL)
1814 goto ucnhashError;
1815 }
1816 if (*s == '{') {
1817 const char *start = s+1;
1818 /* look for the closing brace */
1819 while (*s != '}' && s < end)
1820 s++;
1821 if (s > start && s < end && *s == '}') {
1822 /* found a name. look it up in the unicode database */
1823 message = "unknown Unicode character name";
1824 s++;
1825 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1826 goto store;
1827 }
1828 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001829 endinpos = s-starts;
1830 outpos = p-PyUnicode_AS_UNICODE(v);
1831 if (unicode_decode_call_errorhandler(
1832 errors, &errorHandler,
1833 "unicodeescape", message,
1834 starts, size, &startinpos, &endinpos, &exc, &s,
1835 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001836 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001837 break;
1838
1839 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001840 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001841 message = "\\ at end of string";
1842 s--;
1843 endinpos = s-starts;
1844 outpos = p-PyUnicode_AS_UNICODE(v);
1845 if (unicode_decode_call_errorhandler(
1846 errors, &errorHandler,
1847 "unicodeescape", message,
1848 starts, size, &startinpos, &endinpos, &exc, &s,
1849 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001850 goto onError;
1851 }
1852 else {
1853 *p++ = '\\';
1854 *p++ = (unsigned char)s[-1];
1855 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 nextByte:
1859 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001861 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001863 Py_XDECREF(errorHandler);
1864 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001866
Fredrik Lundhccc74732001-02-18 22:13:49 +00001867ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001868 PyErr_SetString(
1869 PyExc_UnicodeError,
1870 "\\N escapes not supported (can't load unicodedata module)"
1871 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001872 Py_XDECREF(errorHandler);
1873 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001874 return NULL;
1875
Fredrik Lundhccc74732001-02-18 22:13:49 +00001876onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001877 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001878 Py_XDECREF(errorHandler);
1879 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880 return NULL;
1881}
1882
1883/* Return a Unicode-Escape string version of the Unicode object.
1884
1885 If quotes is true, the string is enclosed in u"" or u'' quotes as
1886 appropriate.
1887
1888*/
1889
Barry Warsaw51ac5802000-03-20 16:36:48 +00001890static const Py_UNICODE *findchar(const Py_UNICODE *s,
1891 int size,
1892 Py_UNICODE ch);
1893
Guido van Rossumd57fd912000-03-10 22:53:23 +00001894static
1895PyObject *unicodeescape_string(const Py_UNICODE *s,
1896 int size,
1897 int quotes)
1898{
1899 PyObject *repr;
1900 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001902 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001903
1904 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1905 if (repr == NULL)
1906 return NULL;
1907
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001908 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909
1910 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001911 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001912 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913 !findchar(s, size, '"')) ? '"' : '\'';
1914 }
1915 while (size-- > 0) {
1916 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001917
Guido van Rossumd57fd912000-03-10 22:53:23 +00001918 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001919 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001920 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921 *p++ = '\\';
1922 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001923 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001924 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001925
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001926#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001927 /* Map 21-bit characters to '\U00xxxxxx' */
1928 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001929 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001930
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001931 /* Resize the string if necessary */
1932 if (offset + 12 > PyString_GET_SIZE(repr)) {
1933 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001934 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001935 p = PyString_AS_STRING(repr) + offset;
1936 }
1937
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001938 *p++ = '\\';
1939 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001940 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1941 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1942 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1943 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1944 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1945 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1946 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001947 *p++ = hexdigit[ch & 0x0000000F];
1948 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001949 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001950#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001951 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1952 else if (ch >= 0xD800 && ch < 0xDC00) {
1953 Py_UNICODE ch2;
1954 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00001955
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001956 ch2 = *s++;
1957 size--;
1958 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1959 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1960 *p++ = '\\';
1961 *p++ = 'U';
1962 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1963 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1964 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1965 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1966 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1967 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1968 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1969 *p++ = hexdigit[ucs & 0x0000000F];
1970 continue;
1971 }
1972 /* Fall through: isolated surrogates are copied as-is */
1973 s--;
1974 size++;
1975 }
1976
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001978 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 *p++ = '\\';
1980 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001981 *p++ = hexdigit[(ch >> 12) & 0x000F];
1982 *p++ = hexdigit[(ch >> 8) & 0x000F];
1983 *p++ = hexdigit[(ch >> 4) & 0x000F];
1984 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001986
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001987 /* Map special whitespace to '\t', \n', '\r' */
1988 else if (ch == '\t') {
1989 *p++ = '\\';
1990 *p++ = 't';
1991 }
1992 else if (ch == '\n') {
1993 *p++ = '\\';
1994 *p++ = 'n';
1995 }
1996 else if (ch == '\r') {
1997 *p++ = '\\';
1998 *p++ = 'r';
1999 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002000
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002001 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002002 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002004 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002005 *p++ = hexdigit[(ch >> 4) & 0x000F];
2006 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002007 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002008
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009 /* Copy everything else as-is */
2010 else
2011 *p++ = (char) ch;
2012 }
2013 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002014 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015
2016 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002017 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002018 return repr;
2019}
2020
2021PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2022 int size)
2023{
2024 return unicodeescape_string(s, size, 0);
2025}
2026
2027PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2028{
2029 if (!PyUnicode_Check(unicode)) {
2030 PyErr_BadArgument();
2031 return NULL;
2032 }
2033 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2034 PyUnicode_GET_SIZE(unicode));
2035}
2036
2037/* --- Raw Unicode Escape Codec ------------------------------------------- */
2038
2039PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2040 int size,
2041 const char *errors)
2042{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002043 const char *starts = s;
2044 int startinpos;
2045 int endinpos;
2046 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049 const char *end;
2050 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002051 PyObject *errorHandler = NULL;
2052 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002053
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 /* Escaped strings will always be longer than the resulting
2055 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056 length after conversion to the true value. (But decoding error
2057 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 v = _PyUnicode_New(size);
2059 if (v == NULL)
2060 goto onError;
2061 if (size == 0)
2062 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002063 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064 end = s + size;
2065 while (s < end) {
2066 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002067 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002069 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070
2071 /* Non-escape characters are interpreted as Unicode ordinals */
2072 if (*s != '\\') {
2073 *p++ = (unsigned char)*s++;
2074 continue;
2075 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002076 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077
2078 /* \u-escapes are only interpreted iff the number of leading
2079 backslashes if odd */
2080 bs = s;
2081 for (;s < end;) {
2082 if (*s != '\\')
2083 break;
2084 *p++ = (unsigned char)*s++;
2085 }
2086 if (((s - bs) & 1) == 0 ||
2087 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002088 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089 continue;
2090 }
2091 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002092 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 s++;
2094
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002095 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002096 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002097 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002098 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002100 endinpos = s-starts;
2101 if (unicode_decode_call_errorhandler(
2102 errors, &errorHandler,
2103 "rawunicodeescape", "truncated \\uXXXX",
2104 starts, size, &startinpos, &endinpos, &exc, &s,
2105 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002106 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108 }
2109 x = (x<<4) & ~0xF;
2110 if (c >= '0' && c <= '9')
2111 x += c - '0';
2112 else if (c >= 'a' && c <= 'f')
2113 x += 10 + c - 'a';
2114 else
2115 x += 10 + c - 'A';
2116 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002117#ifndef Py_UNICODE_WIDE
2118 if (x > 0x10000) {
2119 if (unicode_decode_call_errorhandler(
2120 errors, &errorHandler,
2121 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2122 starts, size, &startinpos, &endinpos, &exc, &s,
2123 (PyObject **)&v, &outpos, &p))
2124 goto onError;
2125 }
2126#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 *p++ = x;
2128 nextByte:
2129 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002131 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002132 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 Py_XDECREF(errorHandler);
2134 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002136
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137 onError:
2138 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002139 Py_XDECREF(errorHandler);
2140 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141 return NULL;
2142}
2143
2144PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2145 int size)
2146{
2147 PyObject *repr;
2148 char *p;
2149 char *q;
2150
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002151 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002153#ifdef Py_UNICODE_WIDE
2154 repr = PyString_FromStringAndSize(NULL, 10 * size);
2155#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002157#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 if (repr == NULL)
2159 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002160 if (size == 0)
2161 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162
2163 p = q = PyString_AS_STRING(repr);
2164 while (size-- > 0) {
2165 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002166#ifdef Py_UNICODE_WIDE
2167 /* Map 32-bit characters to '\Uxxxxxxxx' */
2168 if (ch >= 0x10000) {
2169 *p++ = '\\';
2170 *p++ = 'U';
2171 *p++ = hexdigit[(ch >> 28) & 0xf];
2172 *p++ = hexdigit[(ch >> 24) & 0xf];
2173 *p++ = hexdigit[(ch >> 20) & 0xf];
2174 *p++ = hexdigit[(ch >> 16) & 0xf];
2175 *p++ = hexdigit[(ch >> 12) & 0xf];
2176 *p++ = hexdigit[(ch >> 8) & 0xf];
2177 *p++ = hexdigit[(ch >> 4) & 0xf];
2178 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002179 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002180 else
2181#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 /* Map 16-bit characters to '\uxxxx' */
2183 if (ch >= 256) {
2184 *p++ = '\\';
2185 *p++ = 'u';
2186 *p++ = hexdigit[(ch >> 12) & 0xf];
2187 *p++ = hexdigit[(ch >> 8) & 0xf];
2188 *p++ = hexdigit[(ch >> 4) & 0xf];
2189 *p++ = hexdigit[ch & 15];
2190 }
2191 /* Copy everything else as-is */
2192 else
2193 *p++ = (char) ch;
2194 }
2195 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002196 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 return repr;
2198}
2199
2200PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2201{
2202 if (!PyUnicode_Check(unicode)) {
2203 PyErr_BadArgument();
2204 return NULL;
2205 }
2206 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2207 PyUnicode_GET_SIZE(unicode));
2208}
2209
2210/* --- Latin-1 Codec ------------------------------------------------------ */
2211
2212PyObject *PyUnicode_DecodeLatin1(const char *s,
2213 int size,
2214 const char *errors)
2215{
2216 PyUnicodeObject *v;
2217 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002218
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002220 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002221 Py_UNICODE r = *(unsigned char*)s;
2222 return PyUnicode_FromUnicode(&r, 1);
2223 }
2224
Guido van Rossumd57fd912000-03-10 22:53:23 +00002225 v = _PyUnicode_New(size);
2226 if (v == NULL)
2227 goto onError;
2228 if (size == 0)
2229 return (PyObject *)v;
2230 p = PyUnicode_AS_UNICODE(v);
2231 while (size-- > 0)
2232 *p++ = (unsigned char)*s++;
2233 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002234
Guido van Rossumd57fd912000-03-10 22:53:23 +00002235 onError:
2236 Py_XDECREF(v);
2237 return NULL;
2238}
2239
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002240/* create or adjust a UnicodeEncodeError */
2241static void make_encode_exception(PyObject **exceptionObject,
2242 const char *encoding,
2243 const Py_UNICODE *unicode, int size,
2244 int startpos, int endpos,
2245 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002247 if (*exceptionObject == NULL) {
2248 *exceptionObject = PyUnicodeEncodeError_Create(
2249 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 }
2251 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002252 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2253 goto onError;
2254 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2255 goto onError;
2256 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2257 goto onError;
2258 return;
2259 onError:
2260 Py_DECREF(*exceptionObject);
2261 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 }
2263}
2264
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002265/* raises a UnicodeEncodeError */
2266static void raise_encode_exception(PyObject **exceptionObject,
2267 const char *encoding,
2268 const Py_UNICODE *unicode, int size,
2269 int startpos, int endpos,
2270 const char *reason)
2271{
2272 make_encode_exception(exceptionObject,
2273 encoding, unicode, size, startpos, endpos, reason);
2274 if (*exceptionObject != NULL)
2275 PyCodec_StrictErrors(*exceptionObject);
2276}
2277
2278/* error handling callback helper:
2279 build arguments, call the callback and check the arguments,
2280 put the result into newpos and return the replacement string, which
2281 has to be freed by the caller */
2282static PyObject *unicode_encode_call_errorhandler(const char *errors,
2283 PyObject **errorHandler,
2284 const char *encoding, const char *reason,
2285 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2286 int startpos, int endpos,
2287 int *newpos)
2288{
2289 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2290
2291 PyObject *restuple;
2292 PyObject *resunicode;
2293
2294 if (*errorHandler == NULL) {
2295 *errorHandler = PyCodec_LookupError(errors);
2296 if (*errorHandler == NULL)
2297 return NULL;
2298 }
2299
2300 make_encode_exception(exceptionObject,
2301 encoding, unicode, size, startpos, endpos, reason);
2302 if (*exceptionObject == NULL)
2303 return NULL;
2304
2305 restuple = PyObject_CallFunctionObjArgs(
2306 *errorHandler, *exceptionObject, NULL);
2307 if (restuple == NULL)
2308 return NULL;
2309 if (!PyTuple_Check(restuple)) {
2310 PyErr_Format(PyExc_TypeError, &argparse[4]);
2311 Py_DECREF(restuple);
2312 return NULL;
2313 }
2314 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2315 &resunicode, newpos)) {
2316 Py_DECREF(restuple);
2317 return NULL;
2318 }
2319 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002320 *newpos = size+*newpos;
2321 if (*newpos<0 || *newpos>size) {
2322 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2323 Py_DECREF(restuple);
2324 return NULL;
2325 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002326 Py_INCREF(resunicode);
2327 Py_DECREF(restuple);
2328 return resunicode;
2329}
2330
2331static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2332 int size,
2333 const char *errors,
2334 int limit)
2335{
2336 /* output object */
2337 PyObject *res;
2338 /* pointers to the beginning and end+1 of input */
2339 const Py_UNICODE *startp = p;
2340 const Py_UNICODE *endp = p + size;
2341 /* pointer to the beginning of the unencodable characters */
2342 /* const Py_UNICODE *badp = NULL; */
2343 /* pointer into the output */
2344 char *str;
2345 /* current output position */
2346 int respos = 0;
2347 int ressize;
2348 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2349 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2350 PyObject *errorHandler = NULL;
2351 PyObject *exc = NULL;
2352 /* the following variable is used for caching string comparisons
2353 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2354 int known_errorHandler = -1;
2355
2356 /* allocate enough for a simple encoding without
2357 replacements, if we need more, we'll resize */
2358 res = PyString_FromStringAndSize(NULL, size);
2359 if (res == NULL)
2360 goto onError;
2361 if (size == 0)
2362 return res;
2363 str = PyString_AS_STRING(res);
2364 ressize = size;
2365
2366 while (p<endp) {
2367 Py_UNICODE c = *p;
2368
2369 /* can we encode this? */
2370 if (c<limit) {
2371 /* no overflow check, because we know that the space is enough */
2372 *str++ = (char)c;
2373 ++p;
2374 }
2375 else {
2376 int unicodepos = p-startp;
2377 int requiredsize;
2378 PyObject *repunicode;
2379 int repsize;
2380 int newpos;
2381 int respos;
2382 Py_UNICODE *uni2;
2383 /* startpos for collecting unencodable chars */
2384 const Py_UNICODE *collstart = p;
2385 const Py_UNICODE *collend = p;
2386 /* find all unecodable characters */
2387 while ((collend < endp) && ((*collend)>=limit))
2388 ++collend;
2389 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2390 if (known_errorHandler==-1) {
2391 if ((errors==NULL) || (!strcmp(errors, "strict")))
2392 known_errorHandler = 1;
2393 else if (!strcmp(errors, "replace"))
2394 known_errorHandler = 2;
2395 else if (!strcmp(errors, "ignore"))
2396 known_errorHandler = 3;
2397 else if (!strcmp(errors, "xmlcharrefreplace"))
2398 known_errorHandler = 4;
2399 else
2400 known_errorHandler = 0;
2401 }
2402 switch (known_errorHandler) {
2403 case 1: /* strict */
2404 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2405 goto onError;
2406 case 2: /* replace */
2407 while (collstart++<collend)
2408 *str++ = '?'; /* fall through */
2409 case 3: /* ignore */
2410 p = collend;
2411 break;
2412 case 4: /* xmlcharrefreplace */
2413 respos = str-PyString_AS_STRING(res);
2414 /* determine replacement size (temporarily (mis)uses p) */
2415 for (p = collstart, repsize = 0; p < collend; ++p) {
2416 if (*p<10)
2417 repsize += 2+1+1;
2418 else if (*p<100)
2419 repsize += 2+2+1;
2420 else if (*p<1000)
2421 repsize += 2+3+1;
2422 else if (*p<10000)
2423 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002424#ifndef Py_UNICODE_WIDE
2425 else
2426 repsize += 2+5+1;
2427#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002428 else if (*p<100000)
2429 repsize += 2+5+1;
2430 else if (*p<1000000)
2431 repsize += 2+6+1;
2432 else
2433 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002434#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002435 }
2436 requiredsize = respos+repsize+(endp-collend);
2437 if (requiredsize > ressize) {
2438 if (requiredsize<2*ressize)
2439 requiredsize = 2*ressize;
2440 if (_PyString_Resize(&res, requiredsize))
2441 goto onError;
2442 str = PyString_AS_STRING(res) + respos;
2443 ressize = requiredsize;
2444 }
2445 /* generate replacement (temporarily (mis)uses p) */
2446 for (p = collstart; p < collend; ++p) {
2447 str += sprintf(str, "&#%d;", (int)*p);
2448 }
2449 p = collend;
2450 break;
2451 default:
2452 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2453 encoding, reason, startp, size, &exc,
2454 collstart-startp, collend-startp, &newpos);
2455 if (repunicode == NULL)
2456 goto onError;
2457 /* need more space? (at least enough for what we
2458 have+the replacement+the rest of the string, so
2459 we won't have to check space for encodable characters) */
2460 respos = str-PyString_AS_STRING(res);
2461 repsize = PyUnicode_GET_SIZE(repunicode);
2462 requiredsize = respos+repsize+(endp-collend);
2463 if (requiredsize > ressize) {
2464 if (requiredsize<2*ressize)
2465 requiredsize = 2*ressize;
2466 if (_PyString_Resize(&res, requiredsize)) {
2467 Py_DECREF(repunicode);
2468 goto onError;
2469 }
2470 str = PyString_AS_STRING(res) + respos;
2471 ressize = requiredsize;
2472 }
2473 /* check if there is anything unencodable in the replacement
2474 and copy it to the output */
2475 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2476 c = *uni2;
2477 if (c >= limit) {
2478 raise_encode_exception(&exc, encoding, startp, size,
2479 unicodepos, unicodepos+1, reason);
2480 Py_DECREF(repunicode);
2481 goto onError;
2482 }
2483 *str = (char)c;
2484 }
2485 p = startp + newpos;
2486 Py_DECREF(repunicode);
2487 }
2488 }
2489 }
2490 /* Resize if we allocated to much */
2491 respos = str-PyString_AS_STRING(res);
2492 if (respos<ressize)
2493 /* If this falls res will be NULL */
2494 _PyString_Resize(&res, respos);
2495 Py_XDECREF(errorHandler);
2496 Py_XDECREF(exc);
2497 return res;
2498
2499 onError:
2500 Py_XDECREF(res);
2501 Py_XDECREF(errorHandler);
2502 Py_XDECREF(exc);
2503 return NULL;
2504}
2505
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2507 int size,
2508 const char *errors)
2509{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002510 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511}
2512
2513PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2514{
2515 if (!PyUnicode_Check(unicode)) {
2516 PyErr_BadArgument();
2517 return NULL;
2518 }
2519 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2520 PyUnicode_GET_SIZE(unicode),
2521 NULL);
2522}
2523
2524/* --- 7-bit ASCII Codec -------------------------------------------------- */
2525
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526PyObject *PyUnicode_DecodeASCII(const char *s,
2527 int size,
2528 const char *errors)
2529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002530 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531 PyUnicodeObject *v;
2532 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002533 int startinpos;
2534 int endinpos;
2535 int outpos;
2536 const char *e;
2537 PyObject *errorHandler = NULL;
2538 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002539
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002541 if (size == 1 && *(unsigned char*)s < 128) {
2542 Py_UNICODE r = *(unsigned char*)s;
2543 return PyUnicode_FromUnicode(&r, 1);
2544 }
Tim Petersced69f82003-09-16 20:30:58 +00002545
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546 v = _PyUnicode_New(size);
2547 if (v == NULL)
2548 goto onError;
2549 if (size == 0)
2550 return (PyObject *)v;
2551 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002552 e = s + size;
2553 while (s < e) {
2554 register unsigned char c = (unsigned char)*s;
2555 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002557 ++s;
2558 }
2559 else {
2560 startinpos = s-starts;
2561 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002562 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 if (unicode_decode_call_errorhandler(
2564 errors, &errorHandler,
2565 "ascii", "ordinal not in range(128)",
2566 starts, size, &startinpos, &endinpos, &exc, &s,
2567 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002571 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002572 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002573 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002574 Py_XDECREF(errorHandler);
2575 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002577
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 onError:
2579 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002580 Py_XDECREF(errorHandler);
2581 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 return NULL;
2583}
2584
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2586 int size,
2587 const char *errors)
2588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002589 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002590}
2591
2592PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2593{
2594 if (!PyUnicode_Check(unicode)) {
2595 PyErr_BadArgument();
2596 return NULL;
2597 }
2598 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2599 PyUnicode_GET_SIZE(unicode),
2600 NULL);
2601}
2602
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002603#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002604
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002605/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002606
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002607PyObject *PyUnicode_DecodeMBCS(const char *s,
2608 int size,
2609 const char *errors)
2610{
2611 PyUnicodeObject *v;
2612 Py_UNICODE *p;
2613
2614 /* First get the size of the result */
2615 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002616 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002617 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2618
2619 v = _PyUnicode_New(usize);
2620 if (v == NULL)
2621 return NULL;
2622 if (usize == 0)
2623 return (PyObject *)v;
2624 p = PyUnicode_AS_UNICODE(v);
2625 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2626 Py_DECREF(v);
2627 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2628 }
2629
2630 return (PyObject *)v;
2631}
2632
2633PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2634 int size,
2635 const char *errors)
2636{
2637 PyObject *repr;
2638 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002639 DWORD mbcssize;
2640
2641 /* If there are no characters, bail now! */
2642 if (size==0)
2643 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002644
2645 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002646 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002647 if (mbcssize==0)
2648 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2649
2650 repr = PyString_FromStringAndSize(NULL, mbcssize);
2651 if (repr == NULL)
2652 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002653 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002654 return repr;
2655
2656 /* Do the conversion */
2657 s = PyString_AS_STRING(repr);
2658 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2659 Py_DECREF(repr);
2660 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2661 }
2662 return repr;
2663}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002664
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002665PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2666{
2667 if (!PyUnicode_Check(unicode)) {
2668 PyErr_BadArgument();
2669 return NULL;
2670 }
2671 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2672 PyUnicode_GET_SIZE(unicode),
2673 NULL);
2674}
2675
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002676#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002677
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678/* --- Character Mapping Codec -------------------------------------------- */
2679
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680PyObject *PyUnicode_DecodeCharmap(const char *s,
2681 int size,
2682 PyObject *mapping,
2683 const char *errors)
2684{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002685 const char *starts = s;
2686 int startinpos;
2687 int endinpos;
2688 int outpos;
2689 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 PyUnicodeObject *v;
2691 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002692 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002693 PyObject *errorHandler = NULL;
2694 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002695
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 /* Default to Latin-1 */
2697 if (mapping == NULL)
2698 return PyUnicode_DecodeLatin1(s, size, errors);
2699
2700 v = _PyUnicode_New(size);
2701 if (v == NULL)
2702 goto onError;
2703 if (size == 0)
2704 return (PyObject *)v;
2705 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 e = s + size;
2707 while (s < e) {
2708 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 PyObject *w, *x;
2710
2711 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2712 w = PyInt_FromLong((long)ch);
2713 if (w == NULL)
2714 goto onError;
2715 x = PyObject_GetItem(mapping, w);
2716 Py_DECREF(w);
2717 if (x == NULL) {
2718 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002719 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002721 x = Py_None;
2722 Py_INCREF(x);
2723 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002724 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 }
2726
2727 /* Apply mapping */
2728 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002729 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 if (value < 0 || value > 65535) {
2731 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002732 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 Py_DECREF(x);
2734 goto onError;
2735 }
2736 *p++ = (Py_UNICODE)value;
2737 }
2738 else if (x == Py_None) {
2739 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 outpos = p-PyUnicode_AS_UNICODE(v);
2741 startinpos = s-starts;
2742 endinpos = startinpos+1;
2743 if (unicode_decode_call_errorhandler(
2744 errors, &errorHandler,
2745 "charmap", "character maps to <undefined>",
2746 starts, size, &startinpos, &endinpos, &exc, &s,
2747 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748 Py_DECREF(x);
2749 goto onError;
2750 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 }
2753 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002754 int targetsize = PyUnicode_GET_SIZE(x);
2755
2756 if (targetsize == 1)
2757 /* 1-1 mapping */
2758 *p++ = *PyUnicode_AS_UNICODE(x);
2759
2760 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002762 if (targetsize > extrachars) {
2763 /* resize first */
2764 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2765 int needed = (targetsize - extrachars) + \
2766 (targetsize << 2);
2767 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002768 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002769 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002770 Py_DECREF(x);
2771 goto onError;
2772 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002773 p = PyUnicode_AS_UNICODE(v) + oldpos;
2774 }
2775 Py_UNICODE_COPY(p,
2776 PyUnicode_AS_UNICODE(x),
2777 targetsize);
2778 p += targetsize;
2779 extrachars -= targetsize;
2780 }
2781 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782 }
2783 else {
2784 /* wrong return value */
2785 PyErr_SetString(PyExc_TypeError,
2786 "character mapping must return integer, None or unicode");
2787 Py_DECREF(x);
2788 goto onError;
2789 }
2790 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002791 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 }
2793 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002794 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002796 Py_XDECREF(errorHandler);
2797 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002799
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002801 Py_XDECREF(errorHandler);
2802 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 Py_XDECREF(v);
2804 return NULL;
2805}
2806
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002807/* Lookup the character ch in the mapping. If the character
2808 can't be found, Py_None is returned (or NULL, if another
2809 error occured). */
2810static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002812 PyObject *w = PyInt_FromLong((long)c);
2813 PyObject *x;
2814
2815 if (w == NULL)
2816 return NULL;
2817 x = PyObject_GetItem(mapping, w);
2818 Py_DECREF(w);
2819 if (x == NULL) {
2820 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2821 /* No mapping found means: mapping is undefined. */
2822 PyErr_Clear();
2823 x = Py_None;
2824 Py_INCREF(x);
2825 return x;
2826 } else
2827 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002829 else if (x == Py_None)
2830 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002831 else if (PyInt_Check(x)) {
2832 long value = PyInt_AS_LONG(x);
2833 if (value < 0 || value > 255) {
2834 PyErr_SetString(PyExc_TypeError,
2835 "character mapping must be in range(256)");
2836 Py_DECREF(x);
2837 return NULL;
2838 }
2839 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002841 else if (PyString_Check(x))
2842 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002844 /* wrong return value */
2845 PyErr_SetString(PyExc_TypeError,
2846 "character mapping must return integer, None or str");
2847 Py_DECREF(x);
2848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 }
2850}
2851
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002852/* lookup the character, put the result in the output string and adjust
2853 various state variables. Reallocate the output string if not enough
2854 space is available. Return a new reference to the object that
2855 was put in the output buffer, or Py_None, if the mapping was undefined
2856 (in which case no character was written) or NULL, if a
2857 reallocation error ocurred. The called must decref the result */
2858static
2859PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2860 PyObject **outobj, int *outpos)
2861{
2862 PyObject *rep = charmapencode_lookup(c, mapping);
2863
2864 if (rep==NULL)
2865 return NULL;
2866 else if (rep==Py_None)
2867 return rep;
2868 else {
2869 char *outstart = PyString_AS_STRING(*outobj);
2870 int outsize = PyString_GET_SIZE(*outobj);
2871 if (PyInt_Check(rep)) {
2872 int requiredsize = *outpos+1;
2873 if (outsize<requiredsize) {
2874 /* exponentially overallocate to minimize reallocations */
2875 if (requiredsize < 2*outsize)
2876 requiredsize = 2*outsize;
2877 if (_PyString_Resize(outobj, requiredsize)) {
2878 Py_DECREF(rep);
2879 return NULL;
2880 }
2881 outstart = PyString_AS_STRING(*outobj);
2882 }
2883 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2884 }
2885 else {
2886 const char *repchars = PyString_AS_STRING(rep);
2887 int repsize = PyString_GET_SIZE(rep);
2888 int requiredsize = *outpos+repsize;
2889 if (outsize<requiredsize) {
2890 /* exponentially overallocate to minimize reallocations */
2891 if (requiredsize < 2*outsize)
2892 requiredsize = 2*outsize;
2893 if (_PyString_Resize(outobj, requiredsize)) {
2894 Py_DECREF(rep);
2895 return NULL;
2896 }
2897 outstart = PyString_AS_STRING(*outobj);
2898 }
2899 memcpy(outstart + *outpos, repchars, repsize);
2900 *outpos += repsize;
2901 }
2902 }
2903 return rep;
2904}
2905
2906/* handle an error in PyUnicode_EncodeCharmap
2907 Return 0 on success, -1 on error */
2908static
2909int charmap_encoding_error(
2910 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2911 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002912 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913 PyObject **res, int *respos)
2914{
2915 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2916 int repsize;
2917 int newpos;
2918 Py_UNICODE *uni2;
2919 /* startpos for collecting unencodable chars */
2920 int collstartpos = *inpos;
2921 int collendpos = *inpos+1;
2922 int collpos;
2923 char *encoding = "charmap";
2924 char *reason = "character maps to <undefined>";
2925
2926 PyObject *x;
2927 /* find all unencodable characters */
2928 while (collendpos < size) {
2929 x = charmapencode_lookup(p[collendpos], mapping);
2930 if (x==NULL)
2931 return -1;
2932 else if (x!=Py_None) {
2933 Py_DECREF(x);
2934 break;
2935 }
2936 Py_DECREF(x);
2937 ++collendpos;
2938 }
2939 /* cache callback name lookup
2940 * (if not done yet, i.e. it's the first error) */
2941 if (*known_errorHandler==-1) {
2942 if ((errors==NULL) || (!strcmp(errors, "strict")))
2943 *known_errorHandler = 1;
2944 else if (!strcmp(errors, "replace"))
2945 *known_errorHandler = 2;
2946 else if (!strcmp(errors, "ignore"))
2947 *known_errorHandler = 3;
2948 else if (!strcmp(errors, "xmlcharrefreplace"))
2949 *known_errorHandler = 4;
2950 else
2951 *known_errorHandler = 0;
2952 }
2953 switch (*known_errorHandler) {
2954 case 1: /* strict */
2955 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2956 return -1;
2957 case 2: /* replace */
2958 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2959 x = charmapencode_output('?', mapping, res, respos);
2960 if (x==NULL) {
2961 return -1;
2962 }
2963 else if (x==Py_None) {
2964 Py_DECREF(x);
2965 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2966 return -1;
2967 }
2968 Py_DECREF(x);
2969 }
2970 /* fall through */
2971 case 3: /* ignore */
2972 *inpos = collendpos;
2973 break;
2974 case 4: /* xmlcharrefreplace */
2975 /* generate replacement (temporarily (mis)uses p) */
2976 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2977 char buffer[2+29+1+1];
2978 char *cp;
2979 sprintf(buffer, "&#%d;", (int)p[collpos]);
2980 for (cp = buffer; *cp; ++cp) {
2981 x = charmapencode_output(*cp, mapping, res, respos);
2982 if (x==NULL)
2983 return -1;
2984 else if (x==Py_None) {
2985 Py_DECREF(x);
2986 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2987 return -1;
2988 }
2989 Py_DECREF(x);
2990 }
2991 }
2992 *inpos = collendpos;
2993 break;
2994 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002995 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002996 encoding, reason, p, size, exceptionObject,
2997 collstartpos, collendpos, &newpos);
2998 if (repunicode == NULL)
2999 return -1;
3000 /* generate replacement */
3001 repsize = PyUnicode_GET_SIZE(repunicode);
3002 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3003 x = charmapencode_output(*uni2, mapping, res, respos);
3004 if (x==NULL) {
3005 Py_DECREF(repunicode);
3006 return -1;
3007 }
3008 else if (x==Py_None) {
3009 Py_DECREF(repunicode);
3010 Py_DECREF(x);
3011 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3012 return -1;
3013 }
3014 Py_DECREF(x);
3015 }
3016 *inpos = newpos;
3017 Py_DECREF(repunicode);
3018 }
3019 return 0;
3020}
3021
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3023 int size,
3024 PyObject *mapping,
3025 const char *errors)
3026{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003027 /* output object */
3028 PyObject *res = NULL;
3029 /* current input position */
3030 int inpos = 0;
3031 /* current output position */
3032 int respos = 0;
3033 PyObject *errorHandler = NULL;
3034 PyObject *exc = NULL;
3035 /* the following variable is used for caching string comparisons
3036 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3037 * 3=ignore, 4=xmlcharrefreplace */
3038 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039
3040 /* Default to Latin-1 */
3041 if (mapping == NULL)
3042 return PyUnicode_EncodeLatin1(p, size, errors);
3043
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003044 /* allocate enough for a simple encoding without
3045 replacements, if we need more, we'll resize */
3046 res = PyString_FromStringAndSize(NULL, size);
3047 if (res == NULL)
3048 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003049 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003050 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003052 while (inpos<size) {
3053 /* try to encode it */
3054 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3055 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003057 if (x==Py_None) { /* unencodable character */
3058 if (charmap_encoding_error(p, size, &inpos, mapping,
3059 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003060 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003061 &res, &respos)) {
3062 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003063 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 else
3067 /* done with this character => adjust input position */
3068 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 Py_DECREF(x);
3070 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003072 /* Resize if we allocated to much */
3073 if (respos<PyString_GET_SIZE(res)) {
3074 if (_PyString_Resize(&res, respos))
3075 goto onError;
3076 }
3077 Py_XDECREF(exc);
3078 Py_XDECREF(errorHandler);
3079 return res;
3080
3081 onError:
3082 Py_XDECREF(res);
3083 Py_XDECREF(exc);
3084 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 return NULL;
3086}
3087
3088PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3089 PyObject *mapping)
3090{
3091 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3092 PyErr_BadArgument();
3093 return NULL;
3094 }
3095 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3096 PyUnicode_GET_SIZE(unicode),
3097 mapping,
3098 NULL);
3099}
3100
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003101/* create or adjust a UnicodeTranslateError */
3102static void make_translate_exception(PyObject **exceptionObject,
3103 const Py_UNICODE *unicode, int size,
3104 int startpos, int endpos,
3105 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003107 if (*exceptionObject == NULL) {
3108 *exceptionObject = PyUnicodeTranslateError_Create(
3109 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110 }
3111 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003112 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3113 goto onError;
3114 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3115 goto onError;
3116 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3117 goto onError;
3118 return;
3119 onError:
3120 Py_DECREF(*exceptionObject);
3121 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 }
3123}
3124
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003125/* raises a UnicodeTranslateError */
3126static void raise_translate_exception(PyObject **exceptionObject,
3127 const Py_UNICODE *unicode, int size,
3128 int startpos, int endpos,
3129 const char *reason)
3130{
3131 make_translate_exception(exceptionObject,
3132 unicode, size, startpos, endpos, reason);
3133 if (*exceptionObject != NULL)
3134 PyCodec_StrictErrors(*exceptionObject);
3135}
3136
3137/* error handling callback helper:
3138 build arguments, call the callback and check the arguments,
3139 put the result into newpos and return the replacement string, which
3140 has to be freed by the caller */
3141static PyObject *unicode_translate_call_errorhandler(const char *errors,
3142 PyObject **errorHandler,
3143 const char *reason,
3144 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3145 int startpos, int endpos,
3146 int *newpos)
3147{
3148 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3149
3150 PyObject *restuple;
3151 PyObject *resunicode;
3152
3153 if (*errorHandler == NULL) {
3154 *errorHandler = PyCodec_LookupError(errors);
3155 if (*errorHandler == NULL)
3156 return NULL;
3157 }
3158
3159 make_translate_exception(exceptionObject,
3160 unicode, size, startpos, endpos, reason);
3161 if (*exceptionObject == NULL)
3162 return NULL;
3163
3164 restuple = PyObject_CallFunctionObjArgs(
3165 *errorHandler, *exceptionObject, NULL);
3166 if (restuple == NULL)
3167 return NULL;
3168 if (!PyTuple_Check(restuple)) {
3169 PyErr_Format(PyExc_TypeError, &argparse[4]);
3170 Py_DECREF(restuple);
3171 return NULL;
3172 }
3173 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3174 &resunicode, newpos)) {
3175 Py_DECREF(restuple);
3176 return NULL;
3177 }
3178 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003179 *newpos = size+*newpos;
3180 if (*newpos<0 || *newpos>size) {
3181 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3182 Py_DECREF(restuple);
3183 return NULL;
3184 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003185 Py_INCREF(resunicode);
3186 Py_DECREF(restuple);
3187 return resunicode;
3188}
3189
3190/* Lookup the character ch in the mapping and put the result in result,
3191 which must be decrefed by the caller.
3192 Return 0 on success, -1 on error */
3193static
3194int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3195{
3196 PyObject *w = PyInt_FromLong((long)c);
3197 PyObject *x;
3198
3199 if (w == NULL)
3200 return -1;
3201 x = PyObject_GetItem(mapping, w);
3202 Py_DECREF(w);
3203 if (x == NULL) {
3204 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3205 /* No mapping found means: use 1:1 mapping. */
3206 PyErr_Clear();
3207 *result = NULL;
3208 return 0;
3209 } else
3210 return -1;
3211 }
3212 else if (x == Py_None) {
3213 *result = x;
3214 return 0;
3215 }
3216 else if (PyInt_Check(x)) {
3217 long value = PyInt_AS_LONG(x);
3218 long max = PyUnicode_GetMax();
3219 if (value < 0 || value > max) {
3220 PyErr_Format(PyExc_TypeError,
3221 "character mapping must be in range(0x%lx)", max+1);
3222 Py_DECREF(x);
3223 return -1;
3224 }
3225 *result = x;
3226 return 0;
3227 }
3228 else if (PyUnicode_Check(x)) {
3229 *result = x;
3230 return 0;
3231 }
3232 else {
3233 /* wrong return value */
3234 PyErr_SetString(PyExc_TypeError,
3235 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003236 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003237 return -1;
3238 }
3239}
3240/* ensure that *outobj is at least requiredsize characters long,
3241if not reallocate and adjust various state variables.
3242Return 0 on success, -1 on error */
3243static
Walter Dörwald4894c302003-10-24 14:25:28 +00003244int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003245 int requiredsize)
3246{
Walter Dörwald4894c302003-10-24 14:25:28 +00003247 int oldsize = PyUnicode_GET_SIZE(*outobj);
3248 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003249 /* remember old output position */
3250 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3251 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003252 if (requiredsize < 2 * oldsize)
3253 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003254 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003255 return -1;
3256 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003257 }
3258 return 0;
3259}
3260/* lookup the character, put the result in the output string and adjust
3261 various state variables. Return a new reference to the object that
3262 was put in the output buffer in *result, or Py_None, if the mapping was
3263 undefined (in which case no character was written).
3264 The called must decref result.
3265 Return 0 on success, -1 on error. */
3266static
Walter Dörwald4894c302003-10-24 14:25:28 +00003267int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3268 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3269 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003270{
Walter Dörwald4894c302003-10-24 14:25:28 +00003271 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 return -1;
3273 if (*res==NULL) {
3274 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003275 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 }
3277 else if (*res==Py_None)
3278 ;
3279 else if (PyInt_Check(*res)) {
3280 /* no overflow check, because we know that the space is enough */
3281 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3282 }
3283 else if (PyUnicode_Check(*res)) {
3284 int repsize = PyUnicode_GET_SIZE(*res);
3285 if (repsize==1) {
3286 /* no overflow check, because we know that the space is enough */
3287 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3288 }
3289 else if (repsize!=0) {
3290 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003291 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003292 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003293 repsize - 1;
3294 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 return -1;
3296 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3297 *outp += repsize;
3298 }
3299 }
3300 else
3301 return -1;
3302 return 0;
3303}
3304
/* Translate the size characters starting at p through mapping and return
   the result as a new unicode object, or NULL on error.

   Each character is looked up by its integer ordinal.  A missing entry
   copies the character unchanged, an int or unicode entry is written in
   its place, and an entry of None marks the character as untranslatable.
   Runs of untranslatable characters are handled according to errors:
   NULL or "strict" raises a translate error, "replace" writes one '?'
   per character, "ignore" drops them, "xmlcharrefreplace" writes
   "&#N;" per character, and any other name is resolved through
   unicode_translate_call_errorhandler().

   A NULL mapping is rejected with PyErr_BadArgument(). */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     int size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    int respos = 0;
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to translate the current character */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is released here; below it is only compared against Py_None
           by address, never dereferenced */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            int repsize;
            int newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters: extend collend while the
               following characters also map to None */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
                case 1: /* strict */
                    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                    goto onError;
                case 2: /* replace */
                    /* No need to check for space, this is a 1:1 replacement */
                    for (coll = collstart; coll<collend; ++coll)
                        *str++ = '?';
                    /* fall through */
                case 3: /* ignore */
                    p = collend;
                    break;
                case 4: /* xmlcharrefreplace */
                    /* generate replacement (temporarily (mis)uses p) */
                    for (p = collstart; p < collend; ++p) {
                        char buffer[2+29+1+1];
                        char *cp;
                        sprintf(buffer, "&#%d;", (int)*p);
                        /* room for what is already written, this reference
                           and the input after the collected run */
                        if (charmaptranslate_makespace(&res, &str,
                            (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                            goto onError;
                        for (cp = buffer; *cp; ++cp)
                            *str++ = *cp;
                    }
                    p = collend;
                    break;
                default:
                    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                        reason, startp, size, &exc,
                        collstart-startp, collend-startp, &newpos);
                    if (repunicode == NULL)
                        goto onError;
                    /* generate replacement */
                    repsize = PyUnicode_GET_SIZE(repunicode);
                    if (charmaptranslate_makespace(&res, &str,
                        (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                        *str++ = *uni2;
                    /* continue at the position the handler returned */
                    p = startp + newpos;
                    Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<PyUnicode_GET_SIZE(res)) {
        if (_PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

    onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
3446
3447PyObject *PyUnicode_Translate(PyObject *str,
3448 PyObject *mapping,
3449 const char *errors)
3450{
3451 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003452
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 str = PyUnicode_FromObject(str);
3454 if (str == NULL)
3455 goto onError;
3456 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3457 PyUnicode_GET_SIZE(str),
3458 mapping,
3459 errors);
3460 Py_DECREF(str);
3461 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003462
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 onError:
3464 Py_XDECREF(str);
3465 return NULL;
3466}
Tim Petersced69f82003-09-16 20:30:58 +00003467
Guido van Rossum9e896b32000-04-05 20:11:21 +00003468/* --- Decimal Encoder ---------------------------------------------------- */
3469
3470int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3471 int length,
3472 char *output,
3473 const char *errors)
3474{
3475 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 PyObject *errorHandler = NULL;
3477 PyObject *exc = NULL;
3478 const char *encoding = "decimal";
3479 const char *reason = "invalid decimal Unicode string";
3480 /* the following variable is used for caching string comparisons
3481 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3482 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003483
3484 if (output == NULL) {
3485 PyErr_BadArgument();
3486 return -1;
3487 }
3488
3489 p = s;
3490 end = s + length;
3491 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003492 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003493 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003494 PyObject *repunicode;
3495 int repsize;
3496 int newpos;
3497 Py_UNICODE *uni2;
3498 Py_UNICODE *collstart;
3499 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003500
Guido van Rossum9e896b32000-04-05 20:11:21 +00003501 if (Py_UNICODE_ISSPACE(ch)) {
3502 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003504 continue;
3505 }
3506 decimal = Py_UNICODE_TODECIMAL(ch);
3507 if (decimal >= 0) {
3508 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003510 continue;
3511 }
Guido van Rossumba477042000-04-06 18:18:10 +00003512 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003513 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003514 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003515 continue;
3516 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517 /* All other characters are considered unencodable */
3518 collstart = p;
3519 collend = p+1;
3520 while (collend < end) {
3521 if ((0 < *collend && *collend < 256) ||
3522 !Py_UNICODE_ISSPACE(*collend) ||
3523 Py_UNICODE_TODECIMAL(*collend))
3524 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003525 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 /* cache callback name lookup
3527 * (if not done yet, i.e. it's the first error) */
3528 if (known_errorHandler==-1) {
3529 if ((errors==NULL) || (!strcmp(errors, "strict")))
3530 known_errorHandler = 1;
3531 else if (!strcmp(errors, "replace"))
3532 known_errorHandler = 2;
3533 else if (!strcmp(errors, "ignore"))
3534 known_errorHandler = 3;
3535 else if (!strcmp(errors, "xmlcharrefreplace"))
3536 known_errorHandler = 4;
3537 else
3538 known_errorHandler = 0;
3539 }
3540 switch (known_errorHandler) {
3541 case 1: /* strict */
3542 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3543 goto onError;
3544 case 2: /* replace */
3545 for (p = collstart; p < collend; ++p)
3546 *output++ = '?';
3547 /* fall through */
3548 case 3: /* ignore */
3549 p = collend;
3550 break;
3551 case 4: /* xmlcharrefreplace */
3552 /* generate replacement (temporarily (mis)uses p) */
3553 for (p = collstart; p < collend; ++p)
3554 output += sprintf(output, "&#%d;", (int)*p);
3555 p = collend;
3556 break;
3557 default:
3558 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3559 encoding, reason, s, length, &exc,
3560 collstart-s, collend-s, &newpos);
3561 if (repunicode == NULL)
3562 goto onError;
3563 /* generate replacement */
3564 repsize = PyUnicode_GET_SIZE(repunicode);
3565 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3566 Py_UNICODE ch = *uni2;
3567 if (Py_UNICODE_ISSPACE(ch))
3568 *output++ = ' ';
3569 else {
3570 decimal = Py_UNICODE_TODECIMAL(ch);
3571 if (decimal >= 0)
3572 *output++ = '0' + decimal;
3573 else if (0 < ch && ch < 256)
3574 *output++ = (char)ch;
3575 else {
3576 Py_DECREF(repunicode);
3577 raise_encode_exception(&exc, encoding,
3578 s, length, collstart-s, collend-s, reason);
3579 goto onError;
3580 }
3581 }
3582 }
3583 p = s + newpos;
3584 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003585 }
3586 }
3587 /* 0-terminate the output string */
3588 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 Py_XDECREF(exc);
3590 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003591 return 0;
3592
3593 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 Py_XDECREF(exc);
3595 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003596 return -1;
3597}
3598
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599/* --- Helpers ------------------------------------------------------------ */
3600
Tim Petersced69f82003-09-16 20:30:58 +00003601static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602int count(PyUnicodeObject *self,
3603 int start,
3604 int end,
3605 PyUnicodeObject *substring)
3606{
3607 int count = 0;
3608
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003609 if (start < 0)
3610 start += self->length;
3611 if (start < 0)
3612 start = 0;
3613 if (end > self->length)
3614 end = self->length;
3615 if (end < 0)
3616 end += self->length;
3617 if (end < 0)
3618 end = 0;
3619
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003620 if (substring->length == 0)
3621 return (end - start + 1);
3622
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 end -= substring->length;
3624
3625 while (start <= end)
3626 if (Py_UNICODE_MATCH(self, start, substring)) {
3627 count++;
3628 start += substring->length;
3629 } else
3630 start++;
3631
3632 return count;
3633}
3634
3635int PyUnicode_Count(PyObject *str,
3636 PyObject *substr,
3637 int start,
3638 int end)
3639{
3640 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003641
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 str = PyUnicode_FromObject(str);
3643 if (str == NULL)
3644 return -1;
3645 substr = PyUnicode_FromObject(substr);
3646 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003647 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648 return -1;
3649 }
Tim Petersced69f82003-09-16 20:30:58 +00003650
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 result = count((PyUnicodeObject *)str,
3652 start, end,
3653 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003654
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 Py_DECREF(str);
3656 Py_DECREF(substr);
3657 return result;
3658}
3659
Tim Petersced69f82003-09-16 20:30:58 +00003660static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661int findstring(PyUnicodeObject *self,
3662 PyUnicodeObject *substring,
3663 int start,
3664 int end,
3665 int direction)
3666{
3667 if (start < 0)
3668 start += self->length;
3669 if (start < 0)
3670 start = 0;
3671
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672 if (end > self->length)
3673 end = self->length;
3674 if (end < 0)
3675 end += self->length;
3676 if (end < 0)
3677 end = 0;
3678
Guido van Rossum76afbd92002-08-20 17:29:29 +00003679 if (substring->length == 0)
3680 return (direction > 0) ? start : end;
3681
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 end -= substring->length;
3683
3684 if (direction < 0) {
3685 for (; end >= start; end--)
3686 if (Py_UNICODE_MATCH(self, end, substring))
3687 return end;
3688 } else {
3689 for (; start <= end; start++)
3690 if (Py_UNICODE_MATCH(self, start, substring))
3691 return start;
3692 }
3693
3694 return -1;
3695}
3696
3697int PyUnicode_Find(PyObject *str,
3698 PyObject *substr,
3699 int start,
3700 int end,
3701 int direction)
3702{
3703 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003704
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 str = PyUnicode_FromObject(str);
3706 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003707 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 substr = PyUnicode_FromObject(substr);
3709 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003710 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003711 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712 }
Tim Petersced69f82003-09-16 20:30:58 +00003713
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 result = findstring((PyUnicodeObject *)str,
3715 (PyUnicodeObject *)substr,
3716 start, end, direction);
3717 Py_DECREF(str);
3718 Py_DECREF(substr);
3719 return result;
3720}
3721
Tim Petersced69f82003-09-16 20:30:58 +00003722static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723int tailmatch(PyUnicodeObject *self,
3724 PyUnicodeObject *substring,
3725 int start,
3726 int end,
3727 int direction)
3728{
3729 if (start < 0)
3730 start += self->length;
3731 if (start < 0)
3732 start = 0;
3733
3734 if (substring->length == 0)
3735 return 1;
3736
3737 if (end > self->length)
3738 end = self->length;
3739 if (end < 0)
3740 end += self->length;
3741 if (end < 0)
3742 end = 0;
3743
3744 end -= substring->length;
3745 if (end < start)
3746 return 0;
3747
3748 if (direction > 0) {
3749 if (Py_UNICODE_MATCH(self, end, substring))
3750 return 1;
3751 } else {
3752 if (Py_UNICODE_MATCH(self, start, substring))
3753 return 1;
3754 }
3755
3756 return 0;
3757}
3758
3759int PyUnicode_Tailmatch(PyObject *str,
3760 PyObject *substr,
3761 int start,
3762 int end,
3763 int direction)
3764{
3765 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003766
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767 str = PyUnicode_FromObject(str);
3768 if (str == NULL)
3769 return -1;
3770 substr = PyUnicode_FromObject(substr);
3771 if (substr == NULL) {
3772 Py_DECREF(substr);
3773 return -1;
3774 }
Tim Petersced69f82003-09-16 20:30:58 +00003775
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 result = tailmatch((PyUnicodeObject *)str,
3777 (PyUnicodeObject *)substr,
3778 start, end, direction);
3779 Py_DECREF(str);
3780 Py_DECREF(substr);
3781 return result;
3782}
3783
Tim Petersced69f82003-09-16 20:30:58 +00003784static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785const Py_UNICODE *findchar(const Py_UNICODE *s,
3786 int size,
3787 Py_UNICODE ch)
3788{
3789 /* like wcschr, but doesn't stop at NULL characters */
3790
3791 while (size-- > 0) {
3792 if (*s == ch)
3793 return s;
3794 s++;
3795 }
3796
3797 return NULL;
3798}
3799
3800/* Apply fixfct filter to the Unicode object self and return a
3801 reference to the modified object */
3802
Tim Petersced69f82003-09-16 20:30:58 +00003803static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804PyObject *fixup(PyUnicodeObject *self,
3805 int (*fixfct)(PyUnicodeObject *s))
3806{
3807
3808 PyUnicodeObject *u;
3809
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003810 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 if (u == NULL)
3812 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003813
3814 Py_UNICODE_COPY(u->str, self->str, self->length);
3815
Tim Peters7a29bd52001-09-12 03:03:31 +00003816 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 /* fixfct should return TRUE if it modified the buffer. If
3818 FALSE, return a reference to the original buffer instead
3819 (to save space, not time) */
3820 Py_INCREF(self);
3821 Py_DECREF(u);
3822 return (PyObject*) self;
3823 }
3824 return (PyObject*) u;
3825}
3826
Tim Petersced69f82003-09-16 20:30:58 +00003827static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828int fixupper(PyUnicodeObject *self)
3829{
3830 int len = self->length;
3831 Py_UNICODE *s = self->str;
3832 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003833
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 while (len-- > 0) {
3835 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003836
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 ch = Py_UNICODE_TOUPPER(*s);
3838 if (ch != *s) {
3839 status = 1;
3840 *s = ch;
3841 }
3842 s++;
3843 }
3844
3845 return status;
3846}
3847
Tim Petersced69f82003-09-16 20:30:58 +00003848static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849int fixlower(PyUnicodeObject *self)
3850{
3851 int len = self->length;
3852 Py_UNICODE *s = self->str;
3853 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003854
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 while (len-- > 0) {
3856 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003857
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 ch = Py_UNICODE_TOLOWER(*s);
3859 if (ch != *s) {
3860 status = 1;
3861 *s = ch;
3862 }
3863 s++;
3864 }
3865
3866 return status;
3867}
3868
Tim Petersced69f82003-09-16 20:30:58 +00003869static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870int fixswapcase(PyUnicodeObject *self)
3871{
3872 int len = self->length;
3873 Py_UNICODE *s = self->str;
3874 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003875
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876 while (len-- > 0) {
3877 if (Py_UNICODE_ISUPPER(*s)) {
3878 *s = Py_UNICODE_TOLOWER(*s);
3879 status = 1;
3880 } else if (Py_UNICODE_ISLOWER(*s)) {
3881 *s = Py_UNICODE_TOUPPER(*s);
3882 status = 1;
3883 }
3884 s++;
3885 }
3886
3887 return status;
3888}
3889
Tim Petersced69f82003-09-16 20:30:58 +00003890static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891int fixcapitalize(PyUnicodeObject *self)
3892{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003893 int len = self->length;
3894 Py_UNICODE *s = self->str;
3895 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003896
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003897 if (len == 0)
3898 return 0;
3899 if (Py_UNICODE_ISLOWER(*s)) {
3900 *s = Py_UNICODE_TOUPPER(*s);
3901 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003903 s++;
3904 while (--len > 0) {
3905 if (Py_UNICODE_ISUPPER(*s)) {
3906 *s = Py_UNICODE_TOLOWER(*s);
3907 status = 1;
3908 }
3909 s++;
3910 }
3911 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912}
3913
3914static
3915int fixtitle(PyUnicodeObject *self)
3916{
3917 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3918 register Py_UNICODE *e;
3919 int previous_is_cased;
3920
3921 /* Shortcut for single character strings */
3922 if (PyUnicode_GET_SIZE(self) == 1) {
3923 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3924 if (*p != ch) {
3925 *p = ch;
3926 return 1;
3927 }
3928 else
3929 return 0;
3930 }
Tim Petersced69f82003-09-16 20:30:58 +00003931
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932 e = p + PyUnicode_GET_SIZE(self);
3933 previous_is_cased = 0;
3934 for (; p < e; p++) {
3935 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003936
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937 if (previous_is_cased)
3938 *p = Py_UNICODE_TOLOWER(ch);
3939 else
3940 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003941
3942 if (Py_UNICODE_ISLOWER(ch) ||
3943 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944 Py_UNICODE_ISTITLE(ch))
3945 previous_is_cased = 1;
3946 else
3947 previous_is_cased = 0;
3948 }
3949 return 1;
3950}
3951
3952PyObject *PyUnicode_Join(PyObject *separator,
3953 PyObject *seq)
3954{
3955 Py_UNICODE *sep;
3956 int seplen;
3957 PyUnicodeObject *res = NULL;
3958 int reslen = 0;
3959 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 int sz = 100;
3961 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003962 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963
Tim Peters2cfe3682001-05-05 05:36:48 +00003964 it = PyObject_GetIter(seq);
3965 if (it == NULL)
3966 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967
3968 if (separator == NULL) {
3969 Py_UNICODE blank = ' ';
3970 sep = &blank;
3971 seplen = 1;
3972 }
3973 else {
3974 separator = PyUnicode_FromObject(separator);
3975 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003976 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977 sep = PyUnicode_AS_UNICODE(separator);
3978 seplen = PyUnicode_GET_SIZE(separator);
3979 }
Tim Petersced69f82003-09-16 20:30:58 +00003980
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981 res = _PyUnicode_New(sz);
3982 if (res == NULL)
3983 goto onError;
3984 p = PyUnicode_AS_UNICODE(res);
3985 reslen = 0;
3986
Tim Peters2cfe3682001-05-05 05:36:48 +00003987 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003989 PyObject *item = PyIter_Next(it);
3990 if (item == NULL) {
3991 if (PyErr_Occurred())
3992 goto onError;
3993 break;
3994 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 if (!PyUnicode_Check(item)) {
3996 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003997 if (!PyString_Check(item)) {
3998 PyErr_Format(PyExc_TypeError,
3999 "sequence item %i: expected string or Unicode,"
4000 " %.80s found",
4001 i, item->ob_type->tp_name);
4002 Py_DECREF(item);
4003 goto onError;
4004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 v = PyUnicode_FromObject(item);
4006 Py_DECREF(item);
4007 item = v;
4008 if (item == NULL)
4009 goto onError;
4010 }
4011 itemlen = PyUnicode_GET_SIZE(item);
4012 while (reslen + itemlen + seplen >= sz) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004013 if (_PyUnicode_Resize(&res, sz*2) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004014 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017 sz *= 2;
4018 p = PyUnicode_AS_UNICODE(res) + reslen;
4019 }
4020 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004021 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022 p += seplen;
4023 reslen += seplen;
4024 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004025 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 p += itemlen;
4027 reslen += itemlen;
4028 Py_DECREF(item);
4029 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004030 if (_PyUnicode_Resize(&res, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 goto onError;
4032
4033 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004034 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035 return (PyObject *)res;
4036
4037 onError:
4038 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004039 Py_XDECREF(res);
4040 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 return NULL;
4042}
4043
Tim Petersced69f82003-09-16 20:30:58 +00004044static
4045PyUnicodeObject *pad(PyUnicodeObject *self,
4046 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004047 int right,
4048 Py_UNICODE fill)
4049{
4050 PyUnicodeObject *u;
4051
4052 if (left < 0)
4053 left = 0;
4054 if (right < 0)
4055 right = 0;
4056
Tim Peters7a29bd52001-09-12 03:03:31 +00004057 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 Py_INCREF(self);
4059 return self;
4060 }
4061
4062 u = _PyUnicode_New(left + self->length + right);
4063 if (u) {
4064 if (left)
4065 Py_UNICODE_FILL(u->str, fill, left);
4066 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4067 if (right)
4068 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4069 }
4070
4071 return u;
4072}
4073
/* Create a unicode object from data[left:right] and append it to list.
   The expansion site must provide a PyObject *str scratch variable, a
   PyObject *list and an onError label; control jumps to onError if the
   object cannot be created or appended.  Once created, the temporary
   reference in str is released on both the failure and success path.
   The expansion is a sequence of statements ending in an if/else, so it
   must not be used as the unbraced body of an if or a loop. */
#define SPLIT_APPEND(data, left, right) \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
    if (!str) \
        goto onError; \
    if (PyList_Append(list, str)) { \
        Py_DECREF(str); \
        goto onError; \
    } \
    else \
        Py_DECREF(str);
4084
/* Create a unicode object from data[left:right] and insert it at index 0
   of list.
   The expansion site must provide a PyObject *str scratch variable, a
   PyObject *list and an onError label; control jumps to onError if the
   object cannot be created or inserted.  Once created, the temporary
   reference in str is released on both the failure and success path.
   The expansion is a sequence of statements ending in an if/else, so it
   must not be used as the unbraced body of an if or a loop. */
#define SPLIT_INSERT(data, left, right) \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
    if (!str) \
        goto onError; \
    if (PyList_Insert(list, 0, str)) { \
        Py_DECREF(str); \
        goto onError; \
    } \
    else \
        Py_DECREF(str);
4095
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096static
4097PyObject *split_whitespace(PyUnicodeObject *self,
4098 PyObject *list,
4099 int maxcount)
4100{
4101 register int i;
4102 register int j;
4103 int len = self->length;
4104 PyObject *str;
4105
4106 for (i = j = 0; i < len; ) {
4107 /* find a token */
4108 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4109 i++;
4110 j = i;
4111 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4112 i++;
4113 if (j < i) {
4114 if (maxcount-- <= 0)
4115 break;
4116 SPLIT_APPEND(self->str, j, i);
4117 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4118 i++;
4119 j = i;
4120 }
4121 }
4122 if (j < len) {
4123 SPLIT_APPEND(self->str, j, len);
4124 }
4125 return list;
4126
4127 onError:
4128 Py_DECREF(list);
4129 return NULL;
4130}
4131
4132PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004133 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134{
4135 register int i;
4136 register int j;
4137 int len;
4138 PyObject *list;
4139 PyObject *str;
4140 Py_UNICODE *data;
4141
4142 string = PyUnicode_FromObject(string);
4143 if (string == NULL)
4144 return NULL;
4145 data = PyUnicode_AS_UNICODE(string);
4146 len = PyUnicode_GET_SIZE(string);
4147
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 list = PyList_New(0);
4149 if (!list)
4150 goto onError;
4151
4152 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004153 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004154
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155 /* Find a line and append it */
4156 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4157 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004158
4159 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004160 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161 if (i < len) {
4162 if (data[i] == '\r' && i + 1 < len &&
4163 data[i+1] == '\n')
4164 i += 2;
4165 else
4166 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004167 if (keepends)
4168 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 }
Guido van Rossum86662912000-04-11 15:38:46 +00004170 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171 j = i;
4172 }
4173 if (j < len) {
4174 SPLIT_APPEND(data, j, len);
4175 }
4176
4177 Py_DECREF(string);
4178 return list;
4179
4180 onError:
4181 Py_DECREF(list);
4182 Py_DECREF(string);
4183 return NULL;
4184}
4185
Tim Petersced69f82003-09-16 20:30:58 +00004186static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187PyObject *split_char(PyUnicodeObject *self,
4188 PyObject *list,
4189 Py_UNICODE ch,
4190 int maxcount)
4191{
4192 register int i;
4193 register int j;
4194 int len = self->length;
4195 PyObject *str;
4196
4197 for (i = j = 0; i < len; ) {
4198 if (self->str[i] == ch) {
4199 if (maxcount-- <= 0)
4200 break;
4201 SPLIT_APPEND(self->str, j, i);
4202 i = j = i + 1;
4203 } else
4204 i++;
4205 }
4206 if (j <= len) {
4207 SPLIT_APPEND(self->str, j, len);
4208 }
4209 return list;
4210
4211 onError:
4212 Py_DECREF(list);
4213 return NULL;
4214}
4215
Tim Petersced69f82003-09-16 20:30:58 +00004216static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004217PyObject *split_substring(PyUnicodeObject *self,
4218 PyObject *list,
4219 PyUnicodeObject *substring,
4220 int maxcount)
4221{
4222 register int i;
4223 register int j;
4224 int len = self->length;
4225 int sublen = substring->length;
4226 PyObject *str;
4227
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004228 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 if (Py_UNICODE_MATCH(self, i, substring)) {
4230 if (maxcount-- <= 0)
4231 break;
4232 SPLIT_APPEND(self->str, j, i);
4233 i = j = i + sublen;
4234 } else
4235 i++;
4236 }
4237 if (j <= len) {
4238 SPLIT_APPEND(self->str, j, len);
4239 }
4240 return list;
4241
4242 onError:
4243 Py_DECREF(list);
4244 return NULL;
4245}
4246
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004247static
4248PyObject *rsplit_whitespace(PyUnicodeObject *self,
4249 PyObject *list,
4250 int maxcount)
4251{
4252 register int i;
4253 register int j;
4254 int len = self->length;
4255 PyObject *str;
4256
4257 for (i = j = len - 1; i >= 0; ) {
4258 /* find a token */
4259 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4260 i--;
4261 j = i;
4262 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4263 i--;
4264 if (j > i) {
4265 if (maxcount-- <= 0)
4266 break;
4267 SPLIT_INSERT(self->str, i + 1, j + 1);
4268 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4269 i--;
4270 j = i;
4271 }
4272 }
4273 if (j >= 0) {
4274 SPLIT_INSERT(self->str, 0, j + 1);
4275 }
4276 return list;
4277
4278 onError:
4279 Py_DECREF(list);
4280 return NULL;
4281}
4282
4283static
4284PyObject *rsplit_char(PyUnicodeObject *self,
4285 PyObject *list,
4286 Py_UNICODE ch,
4287 int maxcount)
4288{
4289 register int i;
4290 register int j;
4291 int len = self->length;
4292 PyObject *str;
4293
4294 for (i = j = len - 1; i >= 0; ) {
4295 if (self->str[i] == ch) {
4296 if (maxcount-- <= 0)
4297 break;
4298 SPLIT_INSERT(self->str, i + 1, j + 1);
4299 j = i = i - 1;
4300 } else
4301 i--;
4302 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004303 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004304 SPLIT_INSERT(self->str, 0, j + 1);
4305 }
4306 return list;
4307
4308 onError:
4309 Py_DECREF(list);
4310 return NULL;
4311}
4312
4313static
4314PyObject *rsplit_substring(PyUnicodeObject *self,
4315 PyObject *list,
4316 PyUnicodeObject *substring,
4317 int maxcount)
4318{
4319 register int i;
4320 register int j;
4321 int len = self->length;
4322 int sublen = substring->length;
4323 PyObject *str;
4324
4325 for (i = len - sublen, j = len; i >= 0; ) {
4326 if (Py_UNICODE_MATCH(self, i, substring)) {
4327 if (maxcount-- <= 0)
4328 break;
4329 SPLIT_INSERT(self->str, i + sublen, j);
4330 j = i;
4331 i -= sublen;
4332 } else
4333 i--;
4334 }
4335 if (j >= 0) {
4336 SPLIT_INSERT(self->str, 0, j);
4337 }
4338 return list;
4339
4340 onError:
4341 Py_DECREF(list);
4342 return NULL;
4343}
4344
/* The split helper macros are only meant for the functions above. */
#undef SPLIT_INSERT
#undef SPLIT_APPEND
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347
4348static
4349PyObject *split(PyUnicodeObject *self,
4350 PyUnicodeObject *substring,
4351 int maxcount)
4352{
4353 PyObject *list;
4354
4355 if (maxcount < 0)
4356 maxcount = INT_MAX;
4357
4358 list = PyList_New(0);
4359 if (!list)
4360 return NULL;
4361
4362 if (substring == NULL)
4363 return split_whitespace(self,list,maxcount);
4364
4365 else if (substring->length == 1)
4366 return split_char(self,list,substring->str[0],maxcount);
4367
4368 else if (substring->length == 0) {
4369 Py_DECREF(list);
4370 PyErr_SetString(PyExc_ValueError, "empty separator");
4371 return NULL;
4372 }
4373 else
4374 return split_substring(self,list,substring,maxcount);
4375}
4376
Tim Petersced69f82003-09-16 20:30:58 +00004377static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004378PyObject *rsplit(PyUnicodeObject *self,
4379 PyUnicodeObject *substring,
4380 int maxcount)
4381{
4382 PyObject *list;
4383
4384 if (maxcount < 0)
4385 maxcount = INT_MAX;
4386
4387 list = PyList_New(0);
4388 if (!list)
4389 return NULL;
4390
4391 if (substring == NULL)
4392 return rsplit_whitespace(self,list,maxcount);
4393
4394 else if (substring->length == 1)
4395 return rsplit_char(self,list,substring->str[0],maxcount);
4396
4397 else if (substring->length == 0) {
4398 Py_DECREF(list);
4399 PyErr_SetString(PyExc_ValueError, "empty separator");
4400 return NULL;
4401 }
4402 else
4403 return rsplit_substring(self,list,substring,maxcount);
4404}
4405
4406static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407PyObject *replace(PyUnicodeObject *self,
4408 PyUnicodeObject *str1,
4409 PyUnicodeObject *str2,
4410 int maxcount)
4411{
4412 PyUnicodeObject *u;
4413
4414 if (maxcount < 0)
4415 maxcount = INT_MAX;
4416
4417 if (str1->length == 1 && str2->length == 1) {
4418 int i;
4419
4420 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004421 if (!findchar(self->str, self->length, str1->str[0]) &&
4422 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423 /* nothing to replace, return original string */
4424 Py_INCREF(self);
4425 u = self;
4426 } else {
4427 Py_UNICODE u1 = str1->str[0];
4428 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004429
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004431 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 self->length
4433 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004434 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004435 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004436 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 for (i = 0; i < u->length; i++)
4438 if (u->str[i] == u1) {
4439 if (--maxcount < 0)
4440 break;
4441 u->str[i] = u2;
4442 }
4443 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445
4446 } else {
4447 int n, i;
4448 Py_UNICODE *p;
4449
4450 /* replace strings */
4451 n = count(self, 0, self->length, str1);
4452 if (n > maxcount)
4453 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004454 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004456 if (PyUnicode_CheckExact(self)) {
4457 Py_INCREF(self);
4458 u = self;
4459 }
4460 else {
4461 u = (PyUnicodeObject *)
4462 PyUnicode_FromUnicode(self->str, self->length);
4463 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004464 } else {
4465 u = _PyUnicode_New(
4466 self->length + n * (str2->length - str1->length));
4467 if (u) {
4468 i = 0;
4469 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004470 if (str1->length > 0) {
4471 while (i <= self->length - str1->length)
4472 if (Py_UNICODE_MATCH(self, i, str1)) {
4473 /* replace string segment */
4474 Py_UNICODE_COPY(p, str2->str, str2->length);
4475 p += str2->length;
4476 i += str1->length;
4477 if (--n <= 0) {
4478 /* copy remaining part */
4479 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4480 break;
4481 }
4482 } else
4483 *p++ = self->str[i++];
4484 } else {
4485 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 Py_UNICODE_COPY(p, str2->str, str2->length);
4487 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004488 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004491 }
4492 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4493 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494 }
4495 }
4496 }
Tim Petersced69f82003-09-16 20:30:58 +00004497
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 return (PyObject *) u;
4499}
4500
4501/* --- Unicode Object Methods --------------------------------------------- */
4502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004503PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504"S.title() -> unicode\n\
4505\n\
4506Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004507characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508
4509static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004510unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512 return fixup(self, fixtitle);
4513}
4514
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004515PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516"S.capitalize() -> unicode\n\
4517\n\
4518Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004519have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520
4521static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004522unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 return fixup(self, fixcapitalize);
4525}
4526
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, run every
   word through fixup() with fixcapitalize, then join the words again.
   Returns the joined string, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capped == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
4564
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004565/* Argument converter. Coerces to a single unicode character */
4566
4567static int
4568convert_uc(PyObject *obj, void *addr)
4569{
4570 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4571 PyObject *uniobj;
4572 Py_UNICODE *unistr;
4573
4574 uniobj = PyUnicode_FromObject(obj);
4575 if (uniobj == NULL) {
4576 PyErr_SetString(PyExc_TypeError,
4577 "The fill character cannot be converted to Unicode");
4578 return 0;
4579 }
4580 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4581 PyErr_SetString(PyExc_TypeError,
4582 "The fill character must be exactly one character long");
4583 Py_DECREF(uniobj);
4584 return 0;
4585 }
4586 unistr = PyUnicode_AS_UNICODE(uniobj);
4587 *fillcharloc = unistr[0];
4588 Py_DECREF(uniobj);
4589 return 1;
4590}
4591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004592PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004593"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004595Return S centered in a Unicode string of length width. Padding is\n\
4596done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597
4598static PyObject *
4599unicode_center(PyUnicodeObject *self, PyObject *args)
4600{
4601 int marg, left;
4602 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004603 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004605 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606 return NULL;
4607
Tim Peters7a29bd52001-09-12 03:03:31 +00004608 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609 Py_INCREF(self);
4610 return (PyObject*) self;
4611 }
4612
4613 marg = width - self->length;
4614 left = marg / 2 + (marg & width & 1);
4615
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004616 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617}
4618
Marc-André Lemburge5034372000-08-08 08:04:29 +00004619#if 0
4620
4621/* This code should go into some future Unicode collation support
4622 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004623 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004624
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004625/* speedy UTF-16 code point order comparison */
4626/* gleaned from: */
4627/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4628
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004629static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004630{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004631 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004632 0, 0, 0, 0, 0, 0, 0, 0,
4633 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004634 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004635};
4636
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637static int
4638unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4639{
4640 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004641
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642 Py_UNICODE *s1 = str1->str;
4643 Py_UNICODE *s2 = str2->str;
4644
4645 len1 = str1->length;
4646 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004647
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004649 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004650
4651 c1 = *s1++;
4652 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004653
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004654 if (c1 > (1<<11) * 26)
4655 c1 += utf16Fixup[c1>>11];
4656 if (c2 > (1<<11) * 26)
4657 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004658 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004659
4660 if (c1 != c2)
4661 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004662
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004663 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664 }
4665
4666 return (len1 < len2) ? -1 : (len1 != len2);
4667}
4668
Marc-André Lemburge5034372000-08-08 08:04:29 +00004669#else
4670
4671static int
4672unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4673{
4674 register int len1, len2;
4675
4676 Py_UNICODE *s1 = str1->str;
4677 Py_UNICODE *s2 = str2->str;
4678
4679 len1 = str1->length;
4680 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004681
Marc-André Lemburge5034372000-08-08 08:04:29 +00004682 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004683 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004684
Fredrik Lundh45714e92001-06-26 16:39:36 +00004685 c1 = *s1++;
4686 c2 = *s2++;
4687
4688 if (c1 != c2)
4689 return (c1 < c2) ? -1 : 1;
4690
Marc-André Lemburge5034372000-08-08 08:04:29 +00004691 len1--; len2--;
4692 }
4693
4694 return (len1 < len2) ? -1 : (len1 != len2);
4695}
4696
4697#endif
4698
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699int PyUnicode_Compare(PyObject *left,
4700 PyObject *right)
4701{
4702 PyUnicodeObject *u = NULL, *v = NULL;
4703 int result;
4704
4705 /* Coerce the two arguments */
4706 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4707 if (u == NULL)
4708 goto onError;
4709 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4710 if (v == NULL)
4711 goto onError;
4712
Thomas Wouters7e474022000-07-16 12:04:32 +00004713 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714 if (v == u) {
4715 Py_DECREF(u);
4716 Py_DECREF(v);
4717 return 0;
4718 }
4719
4720 result = unicode_compare(u, v);
4721
4722 Py_DECREF(u);
4723 Py_DECREF(v);
4724 return result;
4725
4726onError:
4727 Py_XDECREF(u);
4728 Py_XDECREF(v);
4729 return -1;
4730}
4731
Guido van Rossum403d68b2000-03-13 15:55:09 +00004732int PyUnicode_Contains(PyObject *container,
4733 PyObject *element)
4734{
4735 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004736 int result, size;
4737 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004738
4739 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004740 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004741 if (v == NULL) {
4742 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004743 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004744 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004745 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004746 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004747 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004748 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004749
Barry Warsaw817918c2002-08-06 16:58:21 +00004750 size = PyUnicode_GET_SIZE(v);
4751 rhs = PyUnicode_AS_UNICODE(v);
4752 lhs = PyUnicode_AS_UNICODE(u);
4753
Guido van Rossum403d68b2000-03-13 15:55:09 +00004754 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004755 if (size == 1) {
4756 end = lhs + PyUnicode_GET_SIZE(u);
4757 while (lhs < end) {
4758 if (*lhs++ == *rhs) {
4759 result = 1;
4760 break;
4761 }
4762 }
4763 }
4764 else {
4765 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4766 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004767 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004768 result = 1;
4769 break;
4770 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004771 }
4772 }
4773
4774 Py_DECREF(u);
4775 Py_DECREF(v);
4776 return result;
4777
4778onError:
4779 Py_XDECREF(u);
4780 Py_XDECREF(v);
4781 return -1;
4782}
4783
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784/* Concat to string or Unicode object giving a new Unicode object. */
4785
4786PyObject *PyUnicode_Concat(PyObject *left,
4787 PyObject *right)
4788{
4789 PyUnicodeObject *u = NULL, *v = NULL, *w;
4790
4791 /* Coerce the two arguments */
4792 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4793 if (u == NULL)
4794 goto onError;
4795 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4796 if (v == NULL)
4797 goto onError;
4798
4799 /* Shortcuts */
4800 if (v == unicode_empty) {
4801 Py_DECREF(v);
4802 return (PyObject *)u;
4803 }
4804 if (u == unicode_empty) {
4805 Py_DECREF(u);
4806 return (PyObject *)v;
4807 }
4808
4809 /* Concat the two Unicode strings */
4810 w = _PyUnicode_New(u->length + v->length);
4811 if (w == NULL)
4812 goto onError;
4813 Py_UNICODE_COPY(w->str, u->str, u->length);
4814 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4815
4816 Py_DECREF(u);
4817 Py_DECREF(v);
4818 return (PyObject *)w;
4819
4820onError:
4821 Py_XDECREF(u);
4822 Py_XDECREF(v);
4823 return NULL;
4824}
4825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004826PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827"S.count(sub[, start[, end]]) -> int\n\
4828\n\
4829Return the number of occurrences of substring sub in Unicode string\n\
4830S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004831interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832
4833static PyObject *
4834unicode_count(PyUnicodeObject *self, PyObject *args)
4835{
4836 PyUnicodeObject *substring;
4837 int start = 0;
4838 int end = INT_MAX;
4839 PyObject *result;
4840
Guido van Rossumb8872e62000-05-09 14:14:27 +00004841 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4842 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 return NULL;
4844
4845 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4846 (PyObject *)substring);
4847 if (substring == NULL)
4848 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004849
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850 if (start < 0)
4851 start += self->length;
4852 if (start < 0)
4853 start = 0;
4854 if (end > self->length)
4855 end = self->length;
4856 if (end < 0)
4857 end += self->length;
4858 if (end < 0)
4859 end = 0;
4860
4861 result = PyInt_FromLong((long) count(self, start, end, substring));
4862
4863 Py_DECREF(substring);
4864 return result;
4865}
4866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004867PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868"S.encode([encoding[,errors]]) -> string\n\
4869\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004870Return an encoded string version of S. Default encoding is the current\n\
4871default string encoding. errors may be given to set a different error\n\
4872handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004873a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4874'xmlcharrefreplace' as well as any other name registered with\n\
4875codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876
4877static PyObject *
4878unicode_encode(PyUnicodeObject *self, PyObject *args)
4879{
4880 char *encoding = NULL;
4881 char *errors = NULL;
4882 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4883 return NULL;
4884 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4885}
4886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004887PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888"S.expandtabs([tabsize]) -> unicode\n\
4889\n\
4890Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004891If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892
4893static PyObject*
4894unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4895{
4896 Py_UNICODE *e;
4897 Py_UNICODE *p;
4898 Py_UNICODE *q;
4899 int i, j;
4900 PyUnicodeObject *u;
4901 int tabsize = 8;
4902
4903 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4904 return NULL;
4905
Thomas Wouters7e474022000-07-16 12:04:32 +00004906 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 i = j = 0;
4908 e = self->str + self->length;
4909 for (p = self->str; p < e; p++)
4910 if (*p == '\t') {
4911 if (tabsize > 0)
4912 j += tabsize - (j % tabsize);
4913 }
4914 else {
4915 j++;
4916 if (*p == '\n' || *p == '\r') {
4917 i += j;
4918 j = 0;
4919 }
4920 }
4921
4922 /* Second pass: create output string and fill it */
4923 u = _PyUnicode_New(i + j);
4924 if (!u)
4925 return NULL;
4926
4927 j = 0;
4928 q = u->str;
4929
4930 for (p = self->str; p < e; p++)
4931 if (*p == '\t') {
4932 if (tabsize > 0) {
4933 i = tabsize - (j % tabsize);
4934 j += i;
4935 while (i--)
4936 *q++ = ' ';
4937 }
4938 }
4939 else {
4940 j++;
4941 *q++ = *p;
4942 if (*p == '\n' || *p == '\r')
4943 j = 0;
4944 }
4945
4946 return (PyObject*) u;
4947}
4948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004949PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950"S.find(sub [,start [,end]]) -> int\n\
4951\n\
4952Return the lowest index in S where substring sub is found,\n\
4953such that sub is contained within s[start,end]. Optional\n\
4954arguments start and end are interpreted as in slice notation.\n\
4955\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004956Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957
4958static PyObject *
4959unicode_find(PyUnicodeObject *self, PyObject *args)
4960{
4961 PyUnicodeObject *substring;
4962 int start = 0;
4963 int end = INT_MAX;
4964 PyObject *result;
4965
Guido van Rossumb8872e62000-05-09 14:14:27 +00004966 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4967 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968 return NULL;
4969 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4970 (PyObject *)substring);
4971 if (substring == NULL)
4972 return NULL;
4973
4974 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4975
4976 Py_DECREF(substring);
4977 return result;
4978}
4979
4980static PyObject *
4981unicode_getitem(PyUnicodeObject *self, int index)
4982{
4983 if (index < 0 || index >= self->length) {
4984 PyErr_SetString(PyExc_IndexError, "string index out of range");
4985 return NULL;
4986 }
4987
4988 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4989}
4990
4991static long
4992unicode_hash(PyUnicodeObject *self)
4993{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004994 /* Since Unicode objects compare equal to their ASCII string
4995 counterparts, they should use the individual character values
4996 as basis for their hash value. This is needed to assure that
4997 strings and Unicode objects behave in the same way as
4998 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999
Fredrik Lundhdde61642000-07-10 18:27:47 +00005000 register int len;
5001 register Py_UNICODE *p;
5002 register long x;
5003
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004 if (self->hash != -1)
5005 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005006 len = PyUnicode_GET_SIZE(self);
5007 p = PyUnicode_AS_UNICODE(self);
5008 x = *p << 7;
5009 while (--len >= 0)
5010 x = (1000003*x) ^ *p++;
5011 x ^= PyUnicode_GET_SIZE(self);
5012 if (x == -1)
5013 x = -2;
5014 self->hash = x;
5015 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016}
5017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005018PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019"S.index(sub [,start [,end]]) -> int\n\
5020\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005021Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022
5023static PyObject *
5024unicode_index(PyUnicodeObject *self, PyObject *args)
5025{
5026 int result;
5027 PyUnicodeObject *substring;
5028 int start = 0;
5029 int end = INT_MAX;
5030
Guido van Rossumb8872e62000-05-09 14:14:27 +00005031 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5032 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005034
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5036 (PyObject *)substring);
5037 if (substring == NULL)
5038 return NULL;
5039
5040 result = findstring(self, substring, start, end, 1);
5041
5042 Py_DECREF(substring);
5043 if (result < 0) {
5044 PyErr_SetString(PyExc_ValueError, "substring not found");
5045 return NULL;
5046 }
5047 return PyInt_FromLong(result);
5048}
5049
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005050PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005051"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005053Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005054at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055
5056static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005057unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058{
5059 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5060 register const Py_UNICODE *e;
5061 int cased;
5062
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063 /* Shortcut for single character strings */
5064 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005065 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005067 /* Special case for empty strings */
5068 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005069 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005070
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071 e = p + PyUnicode_GET_SIZE(self);
5072 cased = 0;
5073 for (; p < e; p++) {
5074 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005075
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005077 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 else if (!cased && Py_UNICODE_ISLOWER(ch))
5079 cased = 1;
5080 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005081 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082}
5083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005084PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005085"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005087Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005088at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089
5090static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005091unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092{
5093 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5094 register const Py_UNICODE *e;
5095 int cased;
5096
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097 /* Shortcut for single character strings */
5098 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005099 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005101 /* Special case for empty strings */
5102 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005103 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005104
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105 e = p + PyUnicode_GET_SIZE(self);
5106 cased = 0;
5107 for (; p < e; p++) {
5108 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005109
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005111 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 else if (!cased && Py_UNICODE_ISUPPER(ch))
5113 cased = 1;
5114 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005115 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116}
5117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005118PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005119"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005121Return True if S is a titlecased string and there is at least one\n\
5122character in S, i.e. upper- and titlecase characters may only\n\
5123follow uncased characters and lowercase characters only cased ones.\n\
5124Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125
5126static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005127unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128{
5129 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5130 register const Py_UNICODE *e;
5131 int cased, previous_is_cased;
5132
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133 /* Shortcut for single character strings */
5134 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005135 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5136 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005138 /* Special case for empty strings */
5139 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005140 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005141
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 e = p + PyUnicode_GET_SIZE(self);
5143 cased = 0;
5144 previous_is_cased = 0;
5145 for (; p < e; p++) {
5146 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005147
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5149 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005150 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 previous_is_cased = 1;
5152 cased = 1;
5153 }
5154 else if (Py_UNICODE_ISLOWER(ch)) {
5155 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005156 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 previous_is_cased = 1;
5158 cased = 1;
5159 }
5160 else
5161 previous_is_cased = 0;
5162 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005163 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164}
5165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005166PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005167"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005169Return True if all characters in S are whitespace\n\
5170and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171
5172static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005173unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174{
5175 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5176 register const Py_UNICODE *e;
5177
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 /* Shortcut for single character strings */
5179 if (PyUnicode_GET_SIZE(self) == 1 &&
5180 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005181 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005183 /* Special case for empty strings */
5184 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005185 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005186
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 e = p + PyUnicode_GET_SIZE(self);
5188 for (; p < e; p++) {
5189 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005190 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005192 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193}
5194
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005195PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005196"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005197\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005198Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005199and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005200
5201static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005202unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005203{
5204 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5205 register const Py_UNICODE *e;
5206
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005207 /* Shortcut for single character strings */
5208 if (PyUnicode_GET_SIZE(self) == 1 &&
5209 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005210 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005211
5212 /* Special case for empty strings */
5213 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005214 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005215
5216 e = p + PyUnicode_GET_SIZE(self);
5217 for (; p < e; p++) {
5218 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005219 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005220 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005221 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005222}
5223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005224PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005225"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005226\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005227Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005228and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005229
5230static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005231unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005232{
5233 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5234 register const Py_UNICODE *e;
5235
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005236 /* Shortcut for single character strings */
5237 if (PyUnicode_GET_SIZE(self) == 1 &&
5238 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005239 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005240
5241 /* Special case for empty strings */
5242 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005243 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005244
5245 e = p + PyUnicode_GET_SIZE(self);
5246 for (; p < e; p++) {
5247 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005248 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005249 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005250 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005251}
5252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005253PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005254"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005256Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005257False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258
5259static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005260unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261{
5262 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5263 register const Py_UNICODE *e;
5264
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 /* Shortcut for single character strings */
5266 if (PyUnicode_GET_SIZE(self) == 1 &&
5267 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005268 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005270 /* Special case for empty strings */
5271 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005272 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005273
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 e = p + PyUnicode_GET_SIZE(self);
5275 for (; p < e; p++) {
5276 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005277 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005279 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280}
5281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005282PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005283"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005285Return True if all characters in S are digits\n\
5286and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287
5288static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005289unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290{
5291 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5292 register const Py_UNICODE *e;
5293
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 /* Shortcut for single character strings */
5295 if (PyUnicode_GET_SIZE(self) == 1 &&
5296 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005297 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005299 /* Special case for empty strings */
5300 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005301 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005302
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 e = p + PyUnicode_GET_SIZE(self);
5304 for (; p < e; p++) {
5305 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005306 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005308 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309}
5310
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005311PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005312"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005314Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005315False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316
5317static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005318unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319{
5320 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5321 register const Py_UNICODE *e;
5322
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 /* Shortcut for single character strings */
5324 if (PyUnicode_GET_SIZE(self) == 1 &&
5325 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005326 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005328 /* Special case for empty strings */
5329 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005330 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005331
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 e = p + PyUnicode_GET_SIZE(self);
5333 for (; p < e; p++) {
5334 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005335 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005337 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338}
5339
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005340PyDoc_STRVAR(iswide__doc__,
5341"S.iswide() -> bool\n\
5342\n\
5343Return True if all characters in S are wide width\n\
5344and there is at least one character in S, False otherwise.");
5345
5346static PyObject*
5347unicode_iswide(PyUnicodeObject *self)
5348{
5349 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5350 register const Py_UNICODE *e;
5351
5352 /* Shortcut for single character strings */
5353 if (PyUnicode_GET_SIZE(self) == 1 &&
5354 Py_UNICODE_ISWIDE(*p))
5355 Py_RETURN_TRUE;
5356
5357 /* Special case for empty strings */
5358 if (PyString_GET_SIZE(self) == 0)
5359 Py_RETURN_FALSE;
5360
5361 e = p + PyUnicode_GET_SIZE(self);
5362 for (; p < e; p++) {
5363 if (!Py_UNICODE_ISWIDE(*p))
5364 Py_RETURN_FALSE;
5365 }
5366 Py_RETURN_TRUE;
5367}
5368
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005369PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370"S.join(sequence) -> unicode\n\
5371\n\
5372Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005373sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374
5375static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005376unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005378 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379}
5380
5381static int
5382unicode_length(PyUnicodeObject *self)
5383{
5384 return self->length;
5385}
5386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005387PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005388"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389\n\
5390Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005391done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392
5393static PyObject *
5394unicode_ljust(PyUnicodeObject *self, PyObject *args)
5395{
5396 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005397 Py_UNICODE fillchar = ' ';
5398
5399 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 return NULL;
5401
Tim Peters7a29bd52001-09-12 03:03:31 +00005402 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 Py_INCREF(self);
5404 return (PyObject*) self;
5405 }
5406
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005407 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408}
5409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005410PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411"S.lower() -> unicode\n\
5412\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005413Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414
5415static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005416unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 return fixup(self, fixlower);
5419}
5420
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string past its "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
5429
5430static const Py_UNICODE *
5431unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5432{
Tim Peters030a5ce2002-04-22 19:00:10 +00005433 size_t i;
5434 for (i = 0; i < n; ++i)
5435 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005436 return s+i;
5437 return NULL;
5438}
5439
5440/* externally visible for str.strip(unicode) */
5441PyObject *
5442_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5443{
5444 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5445 int len = PyUnicode_GET_SIZE(self);
5446 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5447 int seplen = PyUnicode_GET_SIZE(sepobj);
5448 int i, j;
5449
5450 i = 0;
5451 if (striptype != RIGHTSTRIP) {
5452 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5453 i++;
5454 }
5455 }
5456
5457 j = len;
5458 if (striptype != LEFTSTRIP) {
5459 do {
5460 j--;
5461 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5462 j++;
5463 }
5464
5465 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5466 Py_INCREF(self);
5467 return (PyObject*)self;
5468 }
5469 else
5470 return PyUnicode_FromUnicode(s+i, j-i);
5471}
5472
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473
5474static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005475do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005477 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5478 int len = PyUnicode_GET_SIZE(self), i, j;
5479
5480 i = 0;
5481 if (striptype != RIGHTSTRIP) {
5482 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5483 i++;
5484 }
5485 }
5486
5487 j = len;
5488 if (striptype != LEFTSTRIP) {
5489 do {
5490 j--;
5491 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5492 j++;
5493 }
5494
5495 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5496 Py_INCREF(self);
5497 return (PyObject*)self;
5498 }
5499 else
5500 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501}
5502
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005503
5504static PyObject *
5505do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5506{
5507 PyObject *sep = NULL;
5508
5509 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5510 return NULL;
5511
5512 if (sep != NULL && sep != Py_None) {
5513 if (PyUnicode_Check(sep))
5514 return _PyUnicode_XStrip(self, striptype, sep);
5515 else if (PyString_Check(sep)) {
5516 PyObject *res;
5517 sep = PyUnicode_FromObject(sep);
5518 if (sep==NULL)
5519 return NULL;
5520 res = _PyUnicode_XStrip(self, striptype, sep);
5521 Py_DECREF(sep);
5522 return res;
5523 }
5524 else {
5525 PyErr_Format(PyExc_TypeError,
5526 "%s arg must be None, unicode or str",
5527 STRIPNAME(striptype));
5528 return NULL;
5529 }
5530 }
5531
5532 return do_strip(self, striptype);
5533}
5534
5535
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005536PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005537"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005538\n\
5539Return a copy of the string S with leading and trailing\n\
5540whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005541If chars is given and not None, remove characters in chars instead.\n\
5542If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005543
5544static PyObject *
5545unicode_strip(PyUnicodeObject *self, PyObject *args)
5546{
5547 if (PyTuple_GET_SIZE(args) == 0)
5548 return do_strip(self, BOTHSTRIP); /* Common case */
5549 else
5550 return do_argstrip(self, BOTHSTRIP, args);
5551}
5552
5553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005554PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005555"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005556\n\
5557Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005558If chars is given and not None, remove characters in chars instead.\n\
5559If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005560
5561static PyObject *
5562unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5563{
5564 if (PyTuple_GET_SIZE(args) == 0)
5565 return do_strip(self, LEFTSTRIP); /* Common case */
5566 else
5567 return do_argstrip(self, LEFTSTRIP, args);
5568}
5569
5570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005571PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005572"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005573\n\
5574Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005575If chars is given and not None, remove characters in chars instead.\n\
5576If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005577
5578static PyObject *
5579unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5580{
5581 if (PyTuple_GET_SIZE(args) == 0)
5582 return do_strip(self, RIGHTSTRIP); /* Common case */
5583 else
5584 return do_argstrip(self, RIGHTSTRIP, args);
5585}
5586
5587
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588static PyObject*
5589unicode_repeat(PyUnicodeObject *str, int len)
5590{
5591 PyUnicodeObject *u;
5592 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005593 int nchars;
5594 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595
5596 if (len < 0)
5597 len = 0;
5598
Tim Peters7a29bd52001-09-12 03:03:31 +00005599 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 /* no repeat, return original string */
5601 Py_INCREF(str);
5602 return (PyObject*) str;
5603 }
Tim Peters8f422462000-09-09 06:13:41 +00005604
5605 /* ensure # of chars needed doesn't overflow int and # of bytes
5606 * needed doesn't overflow size_t
5607 */
5608 nchars = len * str->length;
5609 if (len && nchars / len != str->length) {
5610 PyErr_SetString(PyExc_OverflowError,
5611 "repeated string is too long");
5612 return NULL;
5613 }
5614 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5615 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5616 PyErr_SetString(PyExc_OverflowError,
5617 "repeated string is too long");
5618 return NULL;
5619 }
5620 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 if (!u)
5622 return NULL;
5623
5624 p = u->str;
5625
5626 while (len-- > 0) {
5627 Py_UNICODE_COPY(p, str->str, str->length);
5628 p += str->length;
5629 }
5630
5631 return (PyObject*) u;
5632}
5633
5634PyObject *PyUnicode_Replace(PyObject *obj,
5635 PyObject *subobj,
5636 PyObject *replobj,
5637 int maxcount)
5638{
5639 PyObject *self;
5640 PyObject *str1;
5641 PyObject *str2;
5642 PyObject *result;
5643
5644 self = PyUnicode_FromObject(obj);
5645 if (self == NULL)
5646 return NULL;
5647 str1 = PyUnicode_FromObject(subobj);
5648 if (str1 == NULL) {
5649 Py_DECREF(self);
5650 return NULL;
5651 }
5652 str2 = PyUnicode_FromObject(replobj);
5653 if (str2 == NULL) {
5654 Py_DECREF(self);
5655 Py_DECREF(str1);
5656 return NULL;
5657 }
Tim Petersced69f82003-09-16 20:30:58 +00005658 result = replace((PyUnicodeObject *)self,
5659 (PyUnicodeObject *)str1,
5660 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 maxcount);
5662 Py_DECREF(self);
5663 Py_DECREF(str1);
5664 Py_DECREF(str2);
5665 return result;
5666}
5667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005668PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669"S.replace (old, new[, maxsplit]) -> unicode\n\
5670\n\
5671Return a copy of S with all occurrences of substring\n\
5672old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005673given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674
5675static PyObject*
5676unicode_replace(PyUnicodeObject *self, PyObject *args)
5677{
5678 PyUnicodeObject *str1;
5679 PyUnicodeObject *str2;
5680 int maxcount = -1;
5681 PyObject *result;
5682
5683 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5684 return NULL;
5685 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5686 if (str1 == NULL)
5687 return NULL;
5688 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005689 if (str2 == NULL) {
5690 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693
5694 result = replace(self, str1, str2, maxcount);
5695
5696 Py_DECREF(str1);
5697 Py_DECREF(str2);
5698 return result;
5699}
5700
5701static
5702PyObject *unicode_repr(PyObject *unicode)
5703{
5704 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5705 PyUnicode_GET_SIZE(unicode),
5706 1);
5707}
5708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005709PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710"S.rfind(sub [,start [,end]]) -> int\n\
5711\n\
5712Return the highest index in S where substring sub is found,\n\
5713such that sub is contained within s[start,end]. Optional\n\
5714arguments start and end are interpreted as in slice notation.\n\
5715\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005716Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717
5718static PyObject *
5719unicode_rfind(PyUnicodeObject *self, PyObject *args)
5720{
5721 PyUnicodeObject *substring;
5722 int start = 0;
5723 int end = INT_MAX;
5724 PyObject *result;
5725
Guido van Rossumb8872e62000-05-09 14:14:27 +00005726 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5727 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 return NULL;
5729 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5730 (PyObject *)substring);
5731 if (substring == NULL)
5732 return NULL;
5733
5734 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5735
5736 Py_DECREF(substring);
5737 return result;
5738}
5739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005740PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741"S.rindex(sub [,start [,end]]) -> int\n\
5742\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005743Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744
5745static PyObject *
5746unicode_rindex(PyUnicodeObject *self, PyObject *args)
5747{
5748 int result;
5749 PyUnicodeObject *substring;
5750 int start = 0;
5751 int end = INT_MAX;
5752
Guido van Rossumb8872e62000-05-09 14:14:27 +00005753 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5754 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 return NULL;
5756 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5757 (PyObject *)substring);
5758 if (substring == NULL)
5759 return NULL;
5760
5761 result = findstring(self, substring, start, end, -1);
5762
5763 Py_DECREF(substring);
5764 if (result < 0) {
5765 PyErr_SetString(PyExc_ValueError, "substring not found");
5766 return NULL;
5767 }
5768 return PyInt_FromLong(result);
5769}
5770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005771PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005772"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773\n\
5774Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005775done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776
5777static PyObject *
5778unicode_rjust(PyUnicodeObject *self, PyObject *args)
5779{
5780 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005781 Py_UNICODE fillchar = ' ';
5782
5783 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 return NULL;
5785
Tim Peters7a29bd52001-09-12 03:03:31 +00005786 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 Py_INCREF(self);
5788 return (PyObject*) self;
5789 }
5790
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005791 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792}
5793
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794static PyObject*
5795unicode_slice(PyUnicodeObject *self, int start, int end)
5796{
5797 /* standard clamping */
5798 if (start < 0)
5799 start = 0;
5800 if (end < 0)
5801 end = 0;
5802 if (end > self->length)
5803 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005804 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 /* full slice, return original string */
5806 Py_INCREF(self);
5807 return (PyObject*) self;
5808 }
5809 if (start > end)
5810 start = end;
5811 /* copy slice */
5812 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5813 end - start);
5814}
5815
5816PyObject *PyUnicode_Split(PyObject *s,
5817 PyObject *sep,
5818 int maxsplit)
5819{
5820 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005821
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 s = PyUnicode_FromObject(s);
5823 if (s == NULL)
5824 return NULL;
5825 if (sep != NULL) {
5826 sep = PyUnicode_FromObject(sep);
5827 if (sep == NULL) {
5828 Py_DECREF(s);
5829 return NULL;
5830 }
5831 }
5832
5833 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5834
5835 Py_DECREF(s);
5836 Py_XDECREF(sep);
5837 return result;
5838}
5839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005840PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841"S.split([sep [,maxsplit]]) -> list of strings\n\
5842\n\
5843Return a list of the words in S, using sep as the\n\
5844delimiter string. If maxsplit is given, at most maxsplit\n\
5845splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005846is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847
5848static PyObject*
5849unicode_split(PyUnicodeObject *self, PyObject *args)
5850{
5851 PyObject *substring = Py_None;
5852 int maxcount = -1;
5853
5854 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5855 return NULL;
5856
5857 if (substring == Py_None)
5858 return split(self, NULL, maxcount);
5859 else if (PyUnicode_Check(substring))
5860 return split(self, (PyUnicodeObject *)substring, maxcount);
5861 else
5862 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5863}
5864
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005865PyObject *PyUnicode_RSplit(PyObject *s,
5866 PyObject *sep,
5867 int maxsplit)
5868{
5869 PyObject *result;
5870
5871 s = PyUnicode_FromObject(s);
5872 if (s == NULL)
5873 return NULL;
5874 if (sep != NULL) {
5875 sep = PyUnicode_FromObject(sep);
5876 if (sep == NULL) {
5877 Py_DECREF(s);
5878 return NULL;
5879 }
5880 }
5881
5882 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5883
5884 Py_DECREF(s);
5885 Py_XDECREF(sep);
5886 return result;
5887}
5888
5889PyDoc_STRVAR(rsplit__doc__,
5890"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
5891\n\
5892Return a list of the words in S, using sep as the\n\
5893delimiter string, starting at the end of the string and\n\
5894working to the front. If maxsplit is given, at most maxsplit\n\
5895splits are done. If sep is not specified, any whitespace string\n\
5896is a separator.");
5897
5898static PyObject*
5899unicode_rsplit(PyUnicodeObject *self, PyObject *args)
5900{
5901 PyObject *substring = Py_None;
5902 int maxcount = -1;
5903
5904 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
5905 return NULL;
5906
5907 if (substring == Py_None)
5908 return rsplit(self, NULL, maxcount);
5909 else if (PyUnicode_Check(substring))
5910 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
5911 else
5912 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
5913}
5914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005915PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005916"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917\n\
5918Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005919Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005920is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921
5922static PyObject*
5923unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5924{
Guido van Rossum86662912000-04-11 15:38:46 +00005925 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926
Guido van Rossum86662912000-04-11 15:38:46 +00005927 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 return NULL;
5929
Guido van Rossum86662912000-04-11 15:38:46 +00005930 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931}
5932
5933static
5934PyObject *unicode_str(PyUnicodeObject *self)
5935{
Fred Drakee4315f52000-05-09 19:53:39 +00005936 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937}
5938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005939PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940"S.swapcase() -> unicode\n\
5941\n\
5942Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005943and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944
5945static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005946unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 return fixup(self, fixswapcase);
5949}
5950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005951PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952"S.translate(table) -> unicode\n\
5953\n\
5954Return a copy of the string S, where all characters have been mapped\n\
5955through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005956Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5957Unmapped characters are left untouched. Characters mapped to None\n\
5958are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959
5960static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005961unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962{
Tim Petersced69f82003-09-16 20:30:58 +00005963 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00005965 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 "ignore");
5967}
5968
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005969PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970"S.upper() -> unicode\n\
5971\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005972Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973
5974static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005975unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 return fixup(self, fixupper);
5978}
5979
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005980PyDoc_STRVAR(width__doc__,
5981"S.width() -> unicode\n\
5982\n\
5983Return a fixed-width representation length of S.");
5984
5985static PyObject*
5986unicode_width(PyObject *self)
5987{
5988 int width = PyUnicode_GetWidth(self);
5989 if (width == -1)
5990 return NULL;
5991 else
5992 return PyInt_FromLong((long)width);
5993}
5994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005995PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996"S.zfill(width) -> unicode\n\
5997\n\
5998Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005999of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000
6001static PyObject *
6002unicode_zfill(PyUnicodeObject *self, PyObject *args)
6003{
6004 int fill;
6005 PyUnicodeObject *u;
6006
6007 int width;
6008 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6009 return NULL;
6010
6011 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006012 if (PyUnicode_CheckExact(self)) {
6013 Py_INCREF(self);
6014 return (PyObject*) self;
6015 }
6016 else
6017 return PyUnicode_FromUnicode(
6018 PyUnicode_AS_UNICODE(self),
6019 PyUnicode_GET_SIZE(self)
6020 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 }
6022
6023 fill = width - self->length;
6024
6025 u = pad(self, fill, 0, '0');
6026
Walter Dörwald068325e2002-04-15 13:36:47 +00006027 if (u == NULL)
6028 return NULL;
6029
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 if (u->str[fill] == '+' || u->str[fill] == '-') {
6031 /* move sign to beginning of string */
6032 u->str[0] = u->str[fill];
6033 u->str[fill] = '0';
6034 }
6035
6036 return (PyObject*) u;
6037}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038
#if 0
/* Compiled out.  Returns the current value of unicode_freelist_size
   as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyInt_FromLong(unicode_freelist_size);
    return count;
}
#endif
6046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006047PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006048"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006050Return True if S starts with the specified prefix, False otherwise.\n\
6051With optional start, test S beginning at that position.\n\
6052With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053
6054static PyObject *
6055unicode_startswith(PyUnicodeObject *self,
6056 PyObject *args)
6057{
6058 PyUnicodeObject *substring;
6059 int start = 0;
6060 int end = INT_MAX;
6061 PyObject *result;
6062
Guido van Rossumb8872e62000-05-09 14:14:27 +00006063 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6064 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 return NULL;
6066 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6067 (PyObject *)substring);
6068 if (substring == NULL)
6069 return NULL;
6070
Guido van Rossum77f6a652002-04-03 22:41:51 +00006071 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072
6073 Py_DECREF(substring);
6074 return result;
6075}
6076
6077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006078PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006079"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006081Return True if S ends with the specified suffix, False otherwise.\n\
6082With optional start, test S beginning at that position.\n\
6083With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084
6085static PyObject *
6086unicode_endswith(PyUnicodeObject *self,
6087 PyObject *args)
6088{
6089 PyUnicodeObject *substring;
6090 int start = 0;
6091 int end = INT_MAX;
6092 PyObject *result;
6093
Guido van Rossumb8872e62000-05-09 14:14:27 +00006094 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6095 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 return NULL;
6097 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6098 (PyObject *)substring);
6099 if (substring == NULL)
6100 return NULL;
6101
Guido van Rossum77f6a652002-04-03 22:41:51 +00006102 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103
6104 Py_DECREF(substring);
6105 return result;
6106}
6107
6108
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006109
6110static PyObject *
6111unicode_getnewargs(PyUnicodeObject *v)
6112{
6113 return Py_BuildValue("(u#)", v->str, v->length);
6114}
6115
6116
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117static PyMethodDef unicode_methods[] = {
6118
6119 /* Order is according to common usage: often used methods should
6120 appear first, since lookup is done sequentially. */
6121
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006122 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6123 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6124 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006125 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006126 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6127 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6128 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6129 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6130 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6131 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6132 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6133 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6134 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6135 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006136 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006137/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6138 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6139 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6140 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006141 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006142 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006143 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006144 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6145 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6146 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6147 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6148 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6149 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6150 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6151 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6152 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6153 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6154 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6155 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6156 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6157 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006158 {"iswide", (PyCFunction) unicode_iswide, METH_NOARGS, iswide__doc__},
6159 {"width", (PyCFunction) unicode_width, METH_NOARGS, width__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006160 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006161#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006162 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163#endif
6164
6165#if 0
6166 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006167 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168#endif
6169
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006170 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 {NULL, NULL}
6172};
6173
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006174static PyObject *
6175unicode_mod(PyObject *v, PyObject *w)
6176{
6177 if (!PyUnicode_Check(v)) {
6178 Py_INCREF(Py_NotImplemented);
6179 return Py_NotImplemented;
6180 }
6181 return PyUnicode_Format(v, w);
6182}
6183
6184static PyNumberMethods unicode_as_number = {
6185 0, /*nb_add*/
6186 0, /*nb_subtract*/
6187 0, /*nb_multiply*/
6188 0, /*nb_divide*/
6189 unicode_mod, /*nb_remainder*/
6190};
6191
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192static PySequenceMethods unicode_as_sequence = {
6193 (inquiry) unicode_length, /* sq_length */
6194 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6195 (intargfunc) unicode_repeat, /* sq_repeat */
6196 (intargfunc) unicode_getitem, /* sq_item */
6197 (intintargfunc) unicode_slice, /* sq_slice */
6198 0, /* sq_ass_item */
6199 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006200 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201};
6202
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006203static PyObject*
6204unicode_subscript(PyUnicodeObject* self, PyObject* item)
6205{
6206 if (PyInt_Check(item)) {
6207 long i = PyInt_AS_LONG(item);
6208 if (i < 0)
6209 i += PyString_GET_SIZE(self);
6210 return unicode_getitem(self, i);
6211 } else if (PyLong_Check(item)) {
6212 long i = PyLong_AsLong(item);
6213 if (i == -1 && PyErr_Occurred())
6214 return NULL;
6215 if (i < 0)
6216 i += PyString_GET_SIZE(self);
6217 return unicode_getitem(self, i);
6218 } else if (PySlice_Check(item)) {
6219 int start, stop, step, slicelength, cur, i;
6220 Py_UNICODE* source_buf;
6221 Py_UNICODE* result_buf;
6222 PyObject* result;
6223
6224 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6225 &start, &stop, &step, &slicelength) < 0) {
6226 return NULL;
6227 }
6228
6229 if (slicelength <= 0) {
6230 return PyUnicode_FromUnicode(NULL, 0);
6231 } else {
6232 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6233 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6234
6235 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6236 result_buf[i] = source_buf[cur];
6237 }
Tim Petersced69f82003-09-16 20:30:58 +00006238
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006239 result = PyUnicode_FromUnicode(result_buf, slicelength);
6240 PyMem_FREE(result_buf);
6241 return result;
6242 }
6243 } else {
6244 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6245 return NULL;
6246 }
6247}
6248
6249static PyMappingMethods unicode_as_mapping = {
6250 (inquiry)unicode_length, /* mp_length */
6251 (binaryfunc)unicode_subscript, /* mp_subscript */
6252 (objobjargproc)0, /* mp_ass_subscript */
6253};
6254
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255static int
6256unicode_buffer_getreadbuf(PyUnicodeObject *self,
6257 int index,
6258 const void **ptr)
6259{
6260 if (index != 0) {
6261 PyErr_SetString(PyExc_SystemError,
6262 "accessing non-existent unicode segment");
6263 return -1;
6264 }
6265 *ptr = (void *) self->str;
6266 return PyUnicode_GET_DATA_SIZE(self);
6267}
6268
6269static int
6270unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6271 const void **ptr)
6272{
6273 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006274 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 return -1;
6276}
6277
6278static int
6279unicode_buffer_getsegcount(PyUnicodeObject *self,
6280 int *lenp)
6281{
6282 if (lenp)
6283 *lenp = PyUnicode_GET_DATA_SIZE(self);
6284 return 1;
6285}
6286
6287static int
6288unicode_buffer_getcharbuf(PyUnicodeObject *self,
6289 int index,
6290 const void **ptr)
6291{
6292 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006293
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 if (index != 0) {
6295 PyErr_SetString(PyExc_SystemError,
6296 "accessing non-existent unicode segment");
6297 return -1;
6298 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006299 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 if (str == NULL)
6301 return -1;
6302 *ptr = (void *) PyString_AS_STRING(str);
6303 return PyString_GET_SIZE(str);
6304}
6305
6306/* Helpers for PyUnicode_Format() */
6307
6308static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006309getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310{
6311 int argidx = *p_argidx;
6312 if (argidx < arglen) {
6313 (*p_argidx)++;
6314 if (arglen < 0)
6315 return args;
6316 else
6317 return PyTuple_GetItem(args, argidx);
6318 }
6319 PyErr_SetString(PyExc_TypeError,
6320 "not enough arguments for format string");
6321 return NULL;
6322}
6323
/* Bit flags for format specifiers; each flag occupies its own bit so
   they can be combined and tested with bitwise operators. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6329
6330static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332{
6333 register int i;
6334 int len;
6335 va_list va;
6336 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338
6339 /* First, format the string as char array, then expand to Py_UNICODE
6340 array. */
6341 charbuffer = (char *)buffer;
6342 len = vsprintf(charbuffer, format, va);
6343 for (i = len - 1; i >= 0; i--)
6344 buffer[i] = (Py_UNICODE) charbuffer[i];
6345
6346 va_end(va);
6347 return len;
6348}
6349
Guido van Rossum078151d2002-08-11 04:24:12 +00006350/* XXX To save some code duplication, formatfloat/long/int could have been
6351 shared with stringobject.c, converting from 8-bit to Unicode after the
6352 formatting is done. */
6353
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354static int
6355formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006356 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 int flags,
6358 int prec,
6359 int type,
6360 PyObject *v)
6361{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006362 /* fmt = '%#.' + `prec` + `type`
6363 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364 char fmt[20];
6365 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006366
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 x = PyFloat_AsDouble(v);
6368 if (x == -1.0 && PyErr_Occurred())
6369 return -1;
6370 if (prec < 0)
6371 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6373 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006374 /* Worst case length calc to ensure no buffer overrun:
6375
6376 'g' formats:
6377 fmt = %#.<prec>g
6378 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6379 for any double rep.)
6380 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6381
6382 'f' formats:
6383 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6384 len = 1 + 50 + 1 + prec = 52 + prec
6385
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006386 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006387 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006388
6389 */
6390 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6391 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006392 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006393 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006394 return -1;
6395 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006396 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6397 (flags&F_ALT) ? "#" : "",
6398 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 return usprintf(buf, fmt, x);
6400}
6401
Tim Peters38fd5b62000-09-21 05:43:11 +00006402static PyObject*
6403formatlong(PyObject *val, int flags, int prec, int type)
6404{
6405 char *buf;
6406 int i, len;
6407 PyObject *str; /* temporary string object. */
6408 PyUnicodeObject *result;
6409
6410 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6411 if (!str)
6412 return NULL;
6413 result = _PyUnicode_New(len);
6414 for (i = 0; i < len; i++)
6415 result->str[i] = buf[i];
6416 result->str[len] = 0;
6417 Py_DECREF(str);
6418 return (PyObject*)result;
6419}
6420
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421static int
6422formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006423 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 int flags,
6425 int prec,
6426 int type,
6427 PyObject *v)
6428{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006429 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006430 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6431 * + 1 + 1
6432 * = 24
6433 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006434 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006435 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 long x;
6437
6438 x = PyInt_AsLong(v);
6439 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006440 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006441 if (x < 0 && type == 'u') {
6442 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006443 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006444 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6445 sign = "-";
6446 else
6447 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006449 prec = 1;
6450
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006451 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6452 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006453 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006454 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006455 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006456 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006457 return -1;
6458 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006459
6460 if ((flags & F_ALT) &&
6461 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006462 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006463 * of issues that cause pain:
6464 * - when 0 is being converted, the C standard leaves off
6465 * the '0x' or '0X', which is inconsistent with other
6466 * %#x/%#X conversions and inconsistent with Python's
6467 * hex() function
6468 * - there are platforms that violate the standard and
6469 * convert 0 with the '0x' or '0X'
6470 * (Metrowerks, Compaq Tru64)
6471 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006472 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006473 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006474 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006475 * We can achieve the desired consistency by inserting our
6476 * own '0x' or '0X' prefix, and substituting %x/%X in place
6477 * of %#x/%#X.
6478 *
6479 * Note that this is the same approach as used in
6480 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006481 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006482 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6483 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006484 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006485 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006486 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6487 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006488 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006489 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006490 if (sign[0])
6491 return usprintf(buf, fmt, -x);
6492 else
6493 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494}
6495
6496static int
6497formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006498 size_t buflen,
6499 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006501 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006502 if (PyUnicode_Check(v)) {
6503 if (PyUnicode_GET_SIZE(v) != 1)
6504 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006508 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006509 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006510 goto onError;
6511 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
6514 else {
6515 /* Integer input truncated to a character */
6516 long x;
6517 x = PyInt_AsLong(v);
6518 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006519 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006520#ifdef Py_UNICODE_WIDE
6521 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006522 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006523 "%c arg not in range(0x110000) "
6524 "(wide Python build)");
6525 return -1;
6526 }
6527#else
6528 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006529 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006530 "%c arg not in range(0x10000) "
6531 "(narrow Python build)");
6532 return -1;
6533 }
6534#endif
6535 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 }
6537 buf[1] = '\0';
6538 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006539
6540 onError:
6541 PyErr_SetString(PyExc_TypeError,
6542 "%c requires int or char");
6543 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544}
6545
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. under sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
6555
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556PyObject *PyUnicode_Format(PyObject *format,
6557 PyObject *args)
6558{
6559 Py_UNICODE *fmt, *res;
6560 int fmtcnt, rescnt, reslen, arglen, argidx;
6561 int args_owned = 0;
6562 PyUnicodeObject *result = NULL;
6563 PyObject *dict = NULL;
6564 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006565
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566 if (format == NULL || args == NULL) {
6567 PyErr_BadInternalCall();
6568 return NULL;
6569 }
6570 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006571 if (uformat == NULL)
6572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 fmt = PyUnicode_AS_UNICODE(uformat);
6574 fmtcnt = PyUnicode_GET_SIZE(uformat);
6575
6576 reslen = rescnt = fmtcnt + 100;
6577 result = _PyUnicode_New(reslen);
6578 if (result == NULL)
6579 goto onError;
6580 res = PyUnicode_AS_UNICODE(result);
6581
6582 if (PyTuple_Check(args)) {
6583 arglen = PyTuple_Size(args);
6584 argidx = 0;
6585 }
6586 else {
6587 arglen = -1;
6588 argidx = -2;
6589 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006590 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6591 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 dict = args;
6593
6594 while (--fmtcnt >= 0) {
6595 if (*fmt != '%') {
6596 if (--rescnt < 0) {
6597 rescnt = fmtcnt + 100;
6598 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006599 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 return NULL;
6601 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6602 --rescnt;
6603 }
6604 *res++ = *fmt++;
6605 }
6606 else {
6607 /* Got a format specifier */
6608 int flags = 0;
6609 int width = -1;
6610 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 Py_UNICODE c = '\0';
6612 Py_UNICODE fill;
6613 PyObject *v = NULL;
6614 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006615 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 Py_UNICODE sign;
6617 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006618 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619
6620 fmt++;
6621 if (*fmt == '(') {
6622 Py_UNICODE *keystart;
6623 int keylen;
6624 PyObject *key;
6625 int pcount = 1;
6626
6627 if (dict == NULL) {
6628 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006629 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630 goto onError;
6631 }
6632 ++fmt;
6633 --fmtcnt;
6634 keystart = fmt;
6635 /* Skip over balanced parentheses */
6636 while (pcount > 0 && --fmtcnt >= 0) {
6637 if (*fmt == ')')
6638 --pcount;
6639 else if (*fmt == '(')
6640 ++pcount;
6641 fmt++;
6642 }
6643 keylen = fmt - keystart - 1;
6644 if (fmtcnt < 0 || pcount > 0) {
6645 PyErr_SetString(PyExc_ValueError,
6646 "incomplete format key");
6647 goto onError;
6648 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006649#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006650 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 then looked up since Python uses strings to hold
6652 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006653 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654 key = PyUnicode_EncodeUTF8(keystart,
6655 keylen,
6656 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006657#else
6658 key = PyUnicode_FromUnicode(keystart, keylen);
6659#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 if (key == NULL)
6661 goto onError;
6662 if (args_owned) {
6663 Py_DECREF(args);
6664 args_owned = 0;
6665 }
6666 args = PyObject_GetItem(dict, key);
6667 Py_DECREF(key);
6668 if (args == NULL) {
6669 goto onError;
6670 }
6671 args_owned = 1;
6672 arglen = -1;
6673 argidx = -2;
6674 }
6675 while (--fmtcnt >= 0) {
6676 switch (c = *fmt++) {
6677 case '-': flags |= F_LJUST; continue;
6678 case '+': flags |= F_SIGN; continue;
6679 case ' ': flags |= F_BLANK; continue;
6680 case '#': flags |= F_ALT; continue;
6681 case '0': flags |= F_ZERO; continue;
6682 }
6683 break;
6684 }
6685 if (c == '*') {
6686 v = getnextarg(args, arglen, &argidx);
6687 if (v == NULL)
6688 goto onError;
6689 if (!PyInt_Check(v)) {
6690 PyErr_SetString(PyExc_TypeError,
6691 "* wants int");
6692 goto onError;
6693 }
6694 width = PyInt_AsLong(v);
6695 if (width < 0) {
6696 flags |= F_LJUST;
6697 width = -width;
6698 }
6699 if (--fmtcnt >= 0)
6700 c = *fmt++;
6701 }
6702 else if (c >= '0' && c <= '9') {
6703 width = c - '0';
6704 while (--fmtcnt >= 0) {
6705 c = *fmt++;
6706 if (c < '0' || c > '9')
6707 break;
6708 if ((width*10) / 10 != width) {
6709 PyErr_SetString(PyExc_ValueError,
6710 "width too big");
6711 goto onError;
6712 }
6713 width = width*10 + (c - '0');
6714 }
6715 }
6716 if (c == '.') {
6717 prec = 0;
6718 if (--fmtcnt >= 0)
6719 c = *fmt++;
6720 if (c == '*') {
6721 v = getnextarg(args, arglen, &argidx);
6722 if (v == NULL)
6723 goto onError;
6724 if (!PyInt_Check(v)) {
6725 PyErr_SetString(PyExc_TypeError,
6726 "* wants int");
6727 goto onError;
6728 }
6729 prec = PyInt_AsLong(v);
6730 if (prec < 0)
6731 prec = 0;
6732 if (--fmtcnt >= 0)
6733 c = *fmt++;
6734 }
6735 else if (c >= '0' && c <= '9') {
6736 prec = c - '0';
6737 while (--fmtcnt >= 0) {
6738 c = Py_CHARMASK(*fmt++);
6739 if (c < '0' || c > '9')
6740 break;
6741 if ((prec*10) / 10 != prec) {
6742 PyErr_SetString(PyExc_ValueError,
6743 "prec too big");
6744 goto onError;
6745 }
6746 prec = prec*10 + (c - '0');
6747 }
6748 }
6749 } /* prec */
6750 if (fmtcnt >= 0) {
6751 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 if (--fmtcnt >= 0)
6753 c = *fmt++;
6754 }
6755 }
6756 if (fmtcnt < 0) {
6757 PyErr_SetString(PyExc_ValueError,
6758 "incomplete format");
6759 goto onError;
6760 }
6761 if (c != '%') {
6762 v = getnextarg(args, arglen, &argidx);
6763 if (v == NULL)
6764 goto onError;
6765 }
6766 sign = 0;
6767 fill = ' ';
6768 switch (c) {
6769
6770 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006771 pbuf = formatbuf;
6772 /* presume that buffer length is at least 1 */
6773 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 len = 1;
6775 break;
6776
6777 case 's':
6778 case 'r':
6779 if (PyUnicode_Check(v) && c == 's') {
6780 temp = v;
6781 Py_INCREF(temp);
6782 }
6783 else {
6784 PyObject *unicode;
6785 if (c == 's')
6786 temp = PyObject_Str(v);
6787 else
6788 temp = PyObject_Repr(v);
6789 if (temp == NULL)
6790 goto onError;
6791 if (!PyString_Check(temp)) {
6792 /* XXX Note: this should never happen, since
6793 PyObject_Repr() and PyObject_Str() assure
6794 this */
6795 Py_DECREF(temp);
6796 PyErr_SetString(PyExc_TypeError,
6797 "%s argument has non-string str()");
6798 goto onError;
6799 }
Fred Drakee4315f52000-05-09 19:53:39 +00006800 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006802 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 "strict");
6804 Py_DECREF(temp);
6805 temp = unicode;
6806 if (temp == NULL)
6807 goto onError;
6808 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006809 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 len = PyUnicode_GET_SIZE(temp);
6811 if (prec >= 0 && len > prec)
6812 len = prec;
6813 break;
6814
6815 case 'i':
6816 case 'd':
6817 case 'u':
6818 case 'o':
6819 case 'x':
6820 case 'X':
6821 if (c == 'i')
6822 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006823 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006824 temp = formatlong(v, flags, prec, c);
6825 if (!temp)
6826 goto onError;
6827 pbuf = PyUnicode_AS_UNICODE(temp);
6828 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00006829 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006831 else {
6832 pbuf = formatbuf;
6833 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6834 flags, prec, c, v);
6835 if (len < 0)
6836 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006837 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006838 }
6839 if (flags & F_ZERO)
6840 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 break;
6842
6843 case 'e':
6844 case 'E':
6845 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006846 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 case 'g':
6848 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006849 if (c == 'F')
6850 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006851 pbuf = formatbuf;
6852 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6853 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 if (len < 0)
6855 goto onError;
6856 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006857 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 fill = '0';
6859 break;
6860
6861 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006862 pbuf = formatbuf;
6863 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 if (len < 0)
6865 goto onError;
6866 break;
6867
6868 default:
6869 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006870 "unsupported format character '%c' (0x%x) "
6871 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006872 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006873 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006874 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 goto onError;
6876 }
6877 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006878 if (*pbuf == '-' || *pbuf == '+') {
6879 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 len--;
6881 }
6882 else if (flags & F_SIGN)
6883 sign = '+';
6884 else if (flags & F_BLANK)
6885 sign = ' ';
6886 else
6887 sign = 0;
6888 }
6889 if (width < len)
6890 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006891 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 reslen -= rescnt;
6893 rescnt = width + fmtcnt + 100;
6894 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006895 if (reslen < 0) {
6896 Py_DECREF(result);
6897 return PyErr_NoMemory();
6898 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006899 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900 return NULL;
6901 res = PyUnicode_AS_UNICODE(result)
6902 + reslen - rescnt;
6903 }
6904 if (sign) {
6905 if (fill != ' ')
6906 *res++ = sign;
6907 rescnt--;
6908 if (width > len)
6909 width--;
6910 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006911 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6912 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006913 assert(pbuf[1] == c);
6914 if (fill != ' ') {
6915 *res++ = *pbuf++;
6916 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006917 }
Tim Petersfff53252001-04-12 18:38:48 +00006918 rescnt -= 2;
6919 width -= 2;
6920 if (width < 0)
6921 width = 0;
6922 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006923 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 if (width > len && !(flags & F_LJUST)) {
6925 do {
6926 --rescnt;
6927 *res++ = fill;
6928 } while (--width > len);
6929 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006930 if (fill == ' ') {
6931 if (sign)
6932 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006933 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006934 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006935 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006936 *res++ = *pbuf++;
6937 *res++ = *pbuf++;
6938 }
6939 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006940 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 res += len;
6942 rescnt -= len;
6943 while (--width >= len) {
6944 --rescnt;
6945 *res++ = ' ';
6946 }
6947 if (dict && (argidx < arglen) && c != '%') {
6948 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006949 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 goto onError;
6951 }
6952 Py_XDECREF(temp);
6953 } /* '%' */
6954 } /* until end */
6955 if (argidx < arglen && !dict) {
6956 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006957 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 goto onError;
6959 }
6960
6961 if (args_owned) {
6962 Py_DECREF(args);
6963 }
6964 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00006965 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006966 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967 return (PyObject *)result;
6968
6969 onError:
6970 Py_XDECREF(result);
6971 Py_DECREF(uformat);
6972 if (args_owned) {
6973 Py_DECREF(args);
6974 }
6975 return NULL;
6976}
6977
/* Buffer interface slots for unicode objects; this table is installed
   as tp_as_buffer in PyUnicode_Type. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
6984
Jeremy Hylton938ace62002-07-17 16:30:39 +00006985static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006986unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6987
Tim Peters6d6c1a32001-08-02 04:15:00 +00006988static PyObject *
6989unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6990{
6991 PyObject *x = NULL;
6992 static char *kwlist[] = {"string", "encoding", "errors", 0};
6993 char *encoding = NULL;
6994 char *errors = NULL;
6995
Guido van Rossume023fe02001-08-30 03:12:59 +00006996 if (type != &PyUnicode_Type)
6997 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006998 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6999 kwlist, &x, &encoding, &errors))
7000 return NULL;
7001 if (x == NULL)
7002 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007003 if (encoding == NULL && errors == NULL)
7004 return PyObject_Unicode(x);
7005 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007006 return PyUnicode_FromEncodedObject(x, encoding, errors);
7007}
7008
Guido van Rossume023fe02001-08-30 03:12:59 +00007009static PyObject *
7010unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7011{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007012 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007013 int n;
7014
7015 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7016 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7017 if (tmp == NULL)
7018 return NULL;
7019 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007020 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007021 if (pnew == NULL) {
7022 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007023 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007024 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007025 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7026 if (pnew->str == NULL) {
7027 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007028 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007029 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007030 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007031 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007032 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7033 pnew->length = n;
7034 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007035 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007036 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007037}
7038
/* Docstring of the unicode type; referenced as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007045
/* Type object for unicode.  It derives from PyBaseString_Type, may be
   subclassed (Py_TPFLAGS_BASETYPE) and creates its instances through
   unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE,                /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
7090
7091/* Initialize the Unicode implementation */
7092
Thomas Wouters78890102000-07-22 19:25:51 +00007093void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007095 int i;
7096
Fred Drakee4315f52000-05-09 19:53:39 +00007097 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007098 unicode_freelist = NULL;
7099 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007101 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007102 for (i = 0; i < 256; i++)
7103 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007104 if (PyType_Ready(&PyUnicode_Type) < 0)
7105 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106}
7107
7108/* Finalize the Unicode implementation */
7109
7110void
Thomas Wouters78890102000-07-22 19:25:51 +00007111_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007113 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007114 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007116 Py_XDECREF(unicode_empty);
7117 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007118
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007119 for (i = 0; i < 256; i++) {
7120 if (unicode_latin1[i]) {
7121 Py_DECREF(unicode_latin1[i]);
7122 unicode_latin1[i] = NULL;
7123 }
7124 }
7125
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007126 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127 PyUnicodeObject *v = u;
7128 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007129 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007130 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007131 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007132 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007134 unicode_freelist = NULL;
7135 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007137
7138/*
7139Local variables:
7140c-basic-offset: 4
7141indent-tabs-mode: nil
7142End:
7143*/