blob: 5e5dac55a775d3b57efe65cbb92cf5647107ef5e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of objects kept on the Unicode free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Size limit for the free list "keep-alive" optimization.

   Objects that are put on the free list keep their character buffer
   allocated as long as their length is below this limit.  This saves
   malloc() calls for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory allocated.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switches; little endian is assumed unless
   WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
/* Internal API for use in unicodeobject.c only !  Calls
   PyUnicode_Resize() after casting the address of the object variable
   to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000371 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000387
388 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000390 size = PyUnicode_GET_SIZE(unicode) + 1;
391
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392#ifdef HAVE_USABLE_WCHAR_T
393 memcpy(w, unicode->str, size * sizeof(wchar_t));
394#else
395 {
396 register Py_UNICODE *u;
397 register int i;
398 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000399 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400 *w++ = *u++;
401 }
402#endif
403
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000404 if (size > PyUnicode_GET_SIZE(unicode))
405 return PyUnicode_GET_SIZE(unicode);
406 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000407 return size;
408}
409
410#endif
411
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000412PyObject *PyUnicode_FromOrdinal(int ordinal)
413{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000414 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000415
416#ifdef Py_UNICODE_WIDE
417 if (ordinal < 0 || ordinal > 0x10ffff) {
418 PyErr_SetString(PyExc_ValueError,
419 "unichr() arg not in range(0x110000) "
420 "(wide Python build)");
421 return NULL;
422 }
423#else
424 if (ordinal < 0 || ordinal > 0xffff) {
425 PyErr_SetString(PyExc_ValueError,
426 "unichr() arg not in range(0x10000) "
427 "(narrow Python build)");
428 return NULL;
429 }
430#endif
431
Hye-Shik Chang40574832004-04-06 07:24:51 +0000432 s[0] = (Py_UNICODE)ordinal;
433 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000434}
435
Guido van Rossumd57fd912000-03-10 22:53:23 +0000436PyObject *PyUnicode_FromObject(register PyObject *obj)
437{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 /* XXX Perhaps we should make this API an alias of
439 PyObject_Unicode() instead ?! */
440 if (PyUnicode_CheckExact(obj)) {
441 Py_INCREF(obj);
442 return obj;
443 }
444 if (PyUnicode_Check(obj)) {
445 /* For a Unicode subtype that's not a Unicode object,
446 return a true Unicode object with the same data. */
447 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
448 PyUnicode_GET_SIZE(obj));
449 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000450 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
451}
452
453PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
454 const char *encoding,
455 const char *errors)
456{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000457 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 if (obj == NULL) {
462 PyErr_BadInternalCall();
463 return NULL;
464 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466#if 0
467 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000468 that no encodings is given and then redirect to
469 PyObject_Unicode() which then applies the additional logic for
470 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000471
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000472 NOTE: This API should really only be used for object which
473 represent *encoded* Unicode !
474
475 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000476 if (PyUnicode_Check(obj)) {
477 if (encoding) {
478 PyErr_SetString(PyExc_TypeError,
479 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000480 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000481 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000482 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#else
485 if (PyUnicode_Check(obj)) {
486 PyErr_SetString(PyExc_TypeError,
487 "decoding Unicode is not supported");
488 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000489 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000490#endif
491
492 /* Coerce object */
493 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000494 s = PyString_AS_STRING(obj);
495 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000496 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000497 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
498 /* Overwrite the error message with something more useful in
499 case of a TypeError. */
500 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000502 "coercing to Unicode: need string or buffer, "
503 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000504 obj->ob_type->tp_name);
505 goto onError;
506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 if (len == 0) {
510 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000511 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000512 }
Tim Petersced69f82003-09-16 20:30:58 +0000513 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000515
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return v;
517
518 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000520}
521
522PyObject *PyUnicode_Decode(const char *s,
523 int size,
524 const char *encoding,
525 const char *errors)
526{
527 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000528
529 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000530 encoding = PyUnicode_GetDefaultEncoding();
531
532 /* Shortcuts for common default encodings */
533 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "latin-1") == 0)
536 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000537#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
538 else if (strcmp(encoding, "mbcs") == 0)
539 return PyUnicode_DecodeMBCS(s, size, errors);
540#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000541 else if (strcmp(encoding, "ascii") == 0)
542 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543
544 /* Decode via the codec registry */
545 buffer = PyBuffer_FromMemory((void *)s, size);
546 if (buffer == NULL)
547 goto onError;
548 unicode = PyCodec_Decode(buffer, encoding, errors);
549 if (unicode == NULL)
550 goto onError;
551 if (!PyUnicode_Check(unicode)) {
552 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000553 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554 unicode->ob_type->tp_name);
555 Py_DECREF(unicode);
556 goto onError;
557 }
558 Py_DECREF(buffer);
559 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000560
Guido van Rossumd57fd912000-03-10 22:53:23 +0000561 onError:
562 Py_XDECREF(buffer);
563 return NULL;
564}
565
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000566PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
567 const char *encoding,
568 const char *errors)
569{
570 PyObject *v;
571
572 if (!PyUnicode_Check(unicode)) {
573 PyErr_BadArgument();
574 goto onError;
575 }
576
577 if (encoding == NULL)
578 encoding = PyUnicode_GetDefaultEncoding();
579
580 /* Decode via the codec registry */
581 v = PyCodec_Decode(unicode, encoding, errors);
582 if (v == NULL)
583 goto onError;
584 return v;
585
586 onError:
587 return NULL;
588}
589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590PyObject *PyUnicode_Encode(const Py_UNICODE *s,
591 int size,
592 const char *encoding,
593 const char *errors)
594{
595 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000596
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597 unicode = PyUnicode_FromUnicode(s, size);
598 if (unicode == NULL)
599 return NULL;
600 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
601 Py_DECREF(unicode);
602 return v;
603}
604
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000605PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
606 const char *encoding,
607 const char *errors)
608{
609 PyObject *v;
610
611 if (!PyUnicode_Check(unicode)) {
612 PyErr_BadArgument();
613 goto onError;
614 }
615
616 if (encoding == NULL)
617 encoding = PyUnicode_GetDefaultEncoding();
618
619 /* Encode via the codec registry */
620 v = PyCodec_Encode(unicode, encoding, errors);
621 if (v == NULL)
622 goto onError;
623 return v;
624
625 onError:
626 return NULL;
627}
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
630 const char *encoding,
631 const char *errors)
632{
633 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000634
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635 if (!PyUnicode_Check(unicode)) {
636 PyErr_BadArgument();
637 goto onError;
638 }
Fred Drakee4315f52000-05-09 19:53:39 +0000639
Tim Petersced69f82003-09-16 20:30:58 +0000640 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000641 encoding = PyUnicode_GetDefaultEncoding();
642
643 /* Shortcuts for common default encodings */
644 if (errors == NULL) {
645 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000646 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000647 else if (strcmp(encoding, "latin-1") == 0)
648 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000649#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
650 else if (strcmp(encoding, "mbcs") == 0)
651 return PyUnicode_AsMBCSString(unicode);
652#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000653 else if (strcmp(encoding, "ascii") == 0)
654 return PyUnicode_AsASCIIString(unicode);
655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656
657 /* Encode via the codec registry */
658 v = PyCodec_Encode(unicode, encoding, errors);
659 if (v == NULL)
660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 if (!PyString_Check(v)) {
662 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000663 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 v->ob_type->tp_name);
665 Py_DECREF(v);
666 goto onError;
667 }
668 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000669
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 onError:
671 return NULL;
672}
673
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000674PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
675 const char *errors)
676{
677 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
678
679 if (v)
680 return v;
681 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
682 if (v && errors == NULL)
683 ((PyUnicodeObject *)unicode)->defenc = v;
684 return v;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
688{
689 if (!PyUnicode_Check(unicode)) {
690 PyErr_BadArgument();
691 goto onError;
692 }
693 return PyUnicode_AS_UNICODE(unicode);
694
695 onError:
696 return NULL;
697}
698
699int PyUnicode_GetSize(PyObject *unicode)
700{
701 if (!PyUnicode_Check(unicode)) {
702 PyErr_BadArgument();
703 goto onError;
704 }
705 return PyUnicode_GET_SIZE(unicode);
706
707 onError:
708 return -1;
709}
710
Thomas Wouters78890102000-07-22 19:25:51 +0000711const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000712{
713 return unicode_default_encoding;
714}
715
716int PyUnicode_SetDefaultEncoding(const char *encoding)
717{
718 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000719
Fred Drakee4315f52000-05-09 19:53:39 +0000720 /* Make sure the encoding is valid. As side effect, this also
721 loads the encoding into the codec registry cache. */
722 v = _PyCodec_Lookup(encoding);
723 if (v == NULL)
724 goto onError;
725 Py_DECREF(v);
726 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000727 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000728 sizeof(unicode_default_encoding));
729 return 0;
730
731 onError:
732 return -1;
733}
734
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000735/* error handling callback helper:
736 build arguments, call the callback and check the arguments,
737 if no exception occured, copy the replacement to the output
738 and adjust various state variables.
739 return 0 on success, -1 on error
740*/
741
742static
743int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
744 const char *encoding, const char *reason,
745 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
746 PyObject **output, int *outpos, Py_UNICODE **outptr)
747{
748 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
749
750 PyObject *restuple = NULL;
751 PyObject *repunicode = NULL;
752 int outsize = PyUnicode_GET_SIZE(*output);
753 int requiredsize;
754 int newpos;
755 Py_UNICODE *repptr;
756 int repsize;
757 int res = -1;
758
759 if (*errorHandler == NULL) {
760 *errorHandler = PyCodec_LookupError(errors);
761 if (*errorHandler == NULL)
762 goto onError;
763 }
764
765 if (*exceptionObject == NULL) {
766 *exceptionObject = PyUnicodeDecodeError_Create(
767 encoding, input, insize, *startinpos, *endinpos, reason);
768 if (*exceptionObject == NULL)
769 goto onError;
770 }
771 else {
772 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
773 goto onError;
774 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
775 goto onError;
776 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
777 goto onError;
778 }
779
780 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
781 if (restuple == NULL)
782 goto onError;
783 if (!PyTuple_Check(restuple)) {
784 PyErr_Format(PyExc_TypeError, &argparse[4]);
785 goto onError;
786 }
787 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
788 goto onError;
789 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000790 newpos = insize+newpos;
791 if (newpos<0 || newpos>insize) {
792 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
793 goto onError;
794 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000795
796 /* need more space? (at least enough for what we
797 have+the replacement+the rest of the string (starting
798 at the new input position), so we won't have to check space
799 when there are no errors in the rest of the string) */
800 repptr = PyUnicode_AS_UNICODE(repunicode);
801 repsize = PyUnicode_GET_SIZE(repunicode);
802 requiredsize = *outpos + repsize + insize-newpos;
803 if (requiredsize > outsize) {
804 if (requiredsize<2*outsize)
805 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000806 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807 goto onError;
808 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
809 }
810 *endinpos = newpos;
811 *inptr = input + newpos;
812 Py_UNICODE_COPY(*outptr, repptr, repsize);
813 *outptr += repsize;
814 *outpos += repsize;
815 /* we made it! */
816 res = 0;
817
818 onError:
819 Py_XDECREF(restuple);
820 return res;
821}
822
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000823/* --- UTF-7 Codec -------------------------------------------------------- */
824
825/* see RFC2152 for details */
826
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special (RFC2152 Set D: letters, digits and the nine
               characters apostrophe ( ) , - . slash : ?)
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional)
       One row per 16 code points. */
    /* 0x00-0x0f: controls; TAB, LF and CR count as whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1f: controls */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2f: space and punctuation; plus is special, slash is Set D */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 0,
    /* 0x30-0x3f: digits, colon, Set O punctuation, question mark */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4f: at sign, A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5f: P-Z, brackets etc.; backslash is special */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6f: backquote, a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7f: p-z, braces and bar; tilde and DEL are special */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
845
/* True when character c has to be base64 encoded: always for non-ASCII
   and "special" characters, and for whitespace / Set O characters when
   the corresponding flag is non-zero. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 digit. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when c is one of the 64 base64 digits.  Explicit range tests are
   used instead of isalnum(): the result must not depend on the locale
   and c may be a value that is not representable as unsigned char. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Inverse of B64: value (0..63) of the base64 digit c.  Only meaningful
   when B64CHAR(c) is true. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit one base64 digit through out for every complete group of 6 bits
   held in ch; bits is decremented accordingly (fewer than 6 bits remain
   afterwards). */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Store one Py_UNICODE through out for every complete group of 16 bits
   held in ch.  The expansion uses the variable `errmsg` and the label
   `utf7Error` of the enclosing function. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
880
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000881PyObject *PyUnicode_DecodeUTF7(const char *s,
882 int size,
883 const char *errors)
884{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000885 const char *starts = s;
886 int startinpos;
887 int endinpos;
888 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889 const char *e;
890 PyUnicodeObject *unicode;
891 Py_UNICODE *p;
892 const char *errmsg = "";
893 int inShift = 0;
894 unsigned int bitsleft = 0;
895 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000896 int surrogate = 0;
897 PyObject *errorHandler = NULL;
898 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000899
900 unicode = _PyUnicode_New(size);
901 if (!unicode)
902 return NULL;
903 if (size == 0)
904 return (PyObject *)unicode;
905
906 p = unicode->str;
907 e = s + size;
908
909 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000910 Py_UNICODE ch;
911 restart:
912 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000913
914 if (inShift) {
915 if ((ch == '-') || !B64CHAR(ch)) {
916 inShift = 0;
917 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000918
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000919 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
920 if (bitsleft >= 6) {
921 /* The shift sequence has a partial character in it. If
922 bitsleft < 6 then we could just classify it as padding
923 but that is not the case here */
924
925 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000926 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000927 }
928 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000929 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000930 here so indicate the potential of a misencoded character. */
931
932 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
933 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
934 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000935 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000936 }
937
938 if (ch == '-') {
939 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000940 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000941 inShift = 1;
942 }
943 } else if (SPECIAL(ch,0,0)) {
944 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000945 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946 } else {
947 *p++ = ch;
948 }
949 } else {
950 charsleft = (charsleft << 6) | UB64(ch);
951 bitsleft += 6;
952 s++;
953 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
954 }
955 }
956 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000958 s++;
959 if (s < e && *s == '-') {
960 s++;
961 *p++ = '+';
962 } else
963 {
964 inShift = 1;
965 bitsleft = 0;
966 }
967 }
968 else if (SPECIAL(ch,0,0)) {
969 errmsg = "unexpected special character";
970 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000971 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000972 }
973 else {
974 *p++ = ch;
975 s++;
976 }
977 continue;
978 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000979 outpos = p-PyUnicode_AS_UNICODE(unicode);
980 endinpos = s-starts;
981 if (unicode_decode_call_errorhandler(
982 errors, &errorHandler,
983 "utf7", errmsg,
984 starts, size, &startinpos, &endinpos, &exc, &s,
985 (PyObject **)&unicode, &outpos, &p))
986 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000987 }
988
989 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000990 outpos = p-PyUnicode_AS_UNICODE(unicode);
991 endinpos = size;
992 if (unicode_decode_call_errorhandler(
993 errors, &errorHandler,
994 "utf7", "unterminated shift sequence",
995 starts, size, &startinpos, &endinpos, &exc, &s,
996 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000998 if (s < e)
999 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001000 }
1001
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001002 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001003 goto onError;
1004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001005 Py_XDECREF(errorHandler);
1006 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 return (PyObject *)unicode;
1008
1009onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001010 Py_XDECREF(errorHandler);
1011 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001012 Py_DECREF(unicode);
1013 return NULL;
1014}
1015
1016
1017PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1018 int size,
1019 int encodeSetO,
1020 int encodeWhiteSpace,
1021 const char *errors)
1022{
1023 PyObject *v;
1024 /* It might be possible to tighten this worst case */
1025 unsigned int cbAllocated = 5 * size;
1026 int inShift = 0;
1027 int i = 0;
1028 unsigned int bitsleft = 0;
1029 unsigned long charsleft = 0;
1030 char * out;
1031 char * start;
1032
1033 if (size == 0)
1034 return PyString_FromStringAndSize(NULL, 0);
1035
1036 v = PyString_FromStringAndSize(NULL, cbAllocated);
1037 if (v == NULL)
1038 return NULL;
1039
1040 start = out = PyString_AS_STRING(v);
1041 for (;i < size; ++i) {
1042 Py_UNICODE ch = s[i];
1043
1044 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001045 if (ch == '+') {
1046 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001047 *out++ = '-';
1048 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1049 charsleft = ch;
1050 bitsleft = 16;
1051 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001052 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001053 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001054 } else {
1055 *out++ = (char) ch;
1056 }
1057 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1059 *out++ = B64(charsleft << (6-bitsleft));
1060 charsleft = 0;
1061 bitsleft = 0;
1062 /* Characters not in the BASE64 set implicitly unshift the sequence
1063 so no '-' is required, except if the character is itself a '-' */
1064 if (B64CHAR(ch) || ch == '-') {
1065 *out++ = '-';
1066 }
1067 inShift = 0;
1068 *out++ = (char) ch;
1069 } else {
1070 bitsleft += 16;
1071 charsleft = (charsleft << 16) | ch;
1072 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1073
1074 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001075 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001076 or '-' then the shift sequence will be terminated implicitly and we
1077 don't have to insert a '-'. */
1078
1079 if (bitsleft == 0) {
1080 if (i + 1 < size) {
1081 Py_UNICODE ch2 = s[i+1];
1082
1083 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001084
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001085 } else if (B64CHAR(ch2) || ch2 == '-') {
1086 *out++ = '-';
1087 inShift = 0;
1088 } else {
1089 inShift = 0;
1090 }
1091
1092 }
1093 else {
1094 *out++ = '-';
1095 inShift = 0;
1096 }
1097 }
Tim Petersced69f82003-09-16 20:30:58 +00001098 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001099 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001100 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001101 if (bitsleft) {
1102 *out++= B64(charsleft << (6-bitsleft) );
1103 *out++ = '-';
1104 }
1105
Tim Peters5de98422002-04-27 18:44:32 +00001106 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001107 return v;
1108}
1109
/* The UTF-7 helper macros are private to the codec above: drop them. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1116
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117/* --- UTF-8 Codec -------------------------------------------------------- */
1118
/* Length of a UTF-8 sequence, indexed by the value of its first byte.
   0 marks a byte that is not a legal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x00-0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x10-0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x20-0x2f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x30-0x3f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x40-0x4f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x50-0x5f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x60-0x6f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x70-0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x80-0x8f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x90-0x9f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xa0-0xaf */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xb0-0xbf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* 0xc0-0xcf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* 0xd0-0xdf */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  /* 0xe0-0xef */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0   /* 0xf0-0xff */
};
1140
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141PyObject *PyUnicode_DecodeUTF8(const char *s,
1142 int size,
1143 const char *errors)
1144{
Walter Dörwald69652032004-09-07 20:24:22 +00001145 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1146}
1147
1148PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1149 int size,
1150 const char *errors,
1151 int *consumed)
1152{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001153 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001155 int startinpos;
1156 int endinpos;
1157 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 const char *e;
1159 PyUnicodeObject *unicode;
1160 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001161 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001162 PyObject *errorHandler = NULL;
1163 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164
1165 /* Note: size will always be longer than the resulting Unicode
1166 character count */
1167 unicode = _PyUnicode_New(size);
1168 if (!unicode)
1169 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001170 if (size == 0) {
1171 if (consumed)
1172 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175
1176 /* Unpack UTF-8 encoded data */
1177 p = unicode->str;
1178 e = s + size;
1179
1180 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001181 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182
1183 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001184 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 s++;
1186 continue;
1187 }
1188
1189 n = utf8_code_length[ch];
1190
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001191 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001192 if (consumed)
1193 break;
1194 else {
1195 errmsg = "unexpected end of data";
1196 startinpos = s-starts;
1197 endinpos = size;
1198 goto utf8Error;
1199 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
1202 switch (n) {
1203
1204 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001205 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001206 startinpos = s-starts;
1207 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001208 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209
1210 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001211 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001212 startinpos = s-starts;
1213 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001214 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215
1216 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001217 if ((s[1] & 0xc0) != 0x80) {
1218 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001219 startinpos = s-starts;
1220 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001221 goto utf8Error;
1222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001225 startinpos = s-starts;
1226 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 errmsg = "illegal encoding";
1228 goto utf8Error;
1229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001231 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 break;
1233
1234 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001235 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001236 (s[2] & 0xc0) != 0x80) {
1237 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001238 startinpos = s-starts;
1239 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001240 goto utf8Error;
1241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001243 if (ch < 0x0800) {
1244 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001245 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001246
1247 XXX For wide builds (UCS-4) we should probably try
1248 to recombine the surrogates into a single code
1249 unit.
1250 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001251 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001252 startinpos = s-starts;
1253 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001254 goto utf8Error;
1255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001257 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001258 break;
1259
1260 case 4:
1261 if ((s[1] & 0xc0) != 0x80 ||
1262 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001263 (s[3] & 0xc0) != 0x80) {
1264 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001265 startinpos = s-starts;
1266 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001267 goto utf8Error;
1268 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001269 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1270 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1271 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001272 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001273 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001274 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001275 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001276 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001277 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 startinpos = s-starts;
1279 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001280 goto utf8Error;
1281 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001282#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001283 *p++ = (Py_UNICODE)ch;
1284#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001285 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001286
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001287 /* translate from 10000..10FFFF to 0..FFFF */
1288 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001289
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001290 /* high surrogate = top 10 bits added to D800 */
1291 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001292
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001293 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001294 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001295#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 break;
1297
1298 default:
1299 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001300 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001301 startinpos = s-starts;
1302 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001303 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 }
1305 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001306 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001307
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001308 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001309 outpos = p-PyUnicode_AS_UNICODE(unicode);
1310 if (unicode_decode_call_errorhandler(
1311 errors, &errorHandler,
1312 "utf8", errmsg,
1313 starts, size, &startinpos, &endinpos, &exc, &s,
1314 (PyObject **)&unicode, &outpos, &p))
1315 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 }
Walter Dörwald69652032004-09-07 20:24:22 +00001317 if (consumed)
1318 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001319
1320 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001321 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 goto onError;
1323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001324 Py_XDECREF(errorHandler);
1325 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326 return (PyObject *)unicode;
1327
1328onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001329 Py_XDECREF(errorHandler);
1330 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331 Py_DECREF(unicode);
1332 return NULL;
1333}
1334
Tim Peters602f7402002-04-27 18:03:26 +00001335/* Allocation strategy: if the string is short, convert into a stack buffer
1336 and allocate exactly as much space needed at the end. Else allocate the
1337 maximum possible needed (4 result bytes per Unicode character), and return
1338 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001339*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001340PyObject *
1341PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1342 int size,
1343 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344{
Tim Peters602f7402002-04-27 18:03:26 +00001345#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001346
Tim Peters602f7402002-04-27 18:03:26 +00001347 int i; /* index into s of next input byte */
1348 PyObject *v; /* result string object */
1349 char *p; /* next free byte in output buffer */
1350 int nallocated; /* number of result bytes allocated */
1351 int nneeded; /* number of result bytes needed */
1352 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001353
Tim Peters602f7402002-04-27 18:03:26 +00001354 assert(s != NULL);
1355 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001356
Tim Peters602f7402002-04-27 18:03:26 +00001357 if (size <= MAX_SHORT_UNICHARS) {
1358 /* Write into the stack buffer; nallocated can't overflow.
1359 * At the end, we'll allocate exactly as much heap space as it
1360 * turns out we need.
1361 */
1362 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1363 v = NULL; /* will allocate after we're done */
1364 p = stackbuf;
1365 }
1366 else {
1367 /* Overallocate on the heap, and give the excess back at the end. */
1368 nallocated = size * 4;
1369 if (nallocated / 4 != size) /* overflow! */
1370 return PyErr_NoMemory();
1371 v = PyString_FromStringAndSize(NULL, nallocated);
1372 if (v == NULL)
1373 return NULL;
1374 p = PyString_AS_STRING(v);
1375 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001376
Tim Peters602f7402002-04-27 18:03:26 +00001377 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001378 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001379
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001380 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001381 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001383
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001385 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001386 *p++ = (char)(0xc0 | (ch >> 6));
1387 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001388 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001389 else {
Tim Peters602f7402002-04-27 18:03:26 +00001390 /* Encode UCS2 Unicode ordinals */
1391 if (ch < 0x10000) {
1392 /* Special case: check for high surrogate */
1393 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1394 Py_UCS4 ch2 = s[i];
1395 /* Check for low surrogate and combine the two to
1396 form a UCS4 value */
1397 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001398 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001399 i++;
1400 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001401 }
Tim Peters602f7402002-04-27 18:03:26 +00001402 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001403 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001404 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001405 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1406 *p++ = (char)(0x80 | (ch & 0x3f));
1407 continue;
1408 }
1409encodeUCS4:
1410 /* Encode UCS4 Unicode ordinals */
1411 *p++ = (char)(0xf0 | (ch >> 18));
1412 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1413 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1414 *p++ = (char)(0x80 | (ch & 0x3f));
1415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (v == NULL) {
1419 /* This was stack allocated. */
1420 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1421 assert(nneeded <= nallocated);
1422 v = PyString_FromStringAndSize(stackbuf, nneeded);
1423 }
1424 else {
1425 /* Cut back to size actually needed. */
1426 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1427 assert(nneeded <= nallocated);
1428 _PyString_Resize(&v, nneeded);
1429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001431
Tim Peters602f7402002-04-27 18:03:26 +00001432#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001433}
1434
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1436{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437 if (!PyUnicode_Check(unicode)) {
1438 PyErr_BadArgument();
1439 return NULL;
1440 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001441 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1442 PyUnicode_GET_SIZE(unicode),
1443 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444}
1445
1446/* --- UTF-16 Codec ------------------------------------------------------- */
1447
Tim Peters772747b2001-08-09 22:21:55 +00001448PyObject *
1449PyUnicode_DecodeUTF16(const char *s,
1450 int size,
1451 const char *errors,
1452 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453{
Walter Dörwald69652032004-09-07 20:24:22 +00001454 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1455}
1456
1457PyObject *
1458PyUnicode_DecodeUTF16Stateful(const char *s,
1459 int size,
1460 const char *errors,
1461 int *byteorder,
1462 int *consumed)
1463{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001464 const char *starts = s;
1465 int startinpos;
1466 int endinpos;
1467 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001468 PyUnicodeObject *unicode;
1469 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001470 const unsigned char *q, *e;
1471 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001472 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001473 /* Offsets from q for retrieving byte pairs in the right order. */
1474#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1475 int ihi = 1, ilo = 0;
1476#else
1477 int ihi = 0, ilo = 1;
1478#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001479 PyObject *errorHandler = NULL;
1480 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481
1482 /* Note: size will always be longer than the resulting Unicode
1483 character count */
1484 unicode = _PyUnicode_New(size);
1485 if (!unicode)
1486 return NULL;
1487 if (size == 0)
1488 return (PyObject *)unicode;
1489
1490 /* Unpack UTF-16 encoded data */
1491 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001492 q = (unsigned char *)s;
1493 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494
1495 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001496 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001498 /* Check for BOM marks (U+FEFF) in the input and adjust current
1499 byte order setting accordingly. In native mode, the leading BOM
1500 mark is skipped, in all other modes, it is copied to the output
1501 stream as-is (giving a ZWNBSP character). */
1502 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001503 if (size >= 2) {
1504 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001505#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001506 if (bom == 0xFEFF) {
1507 q += 2;
1508 bo = -1;
1509 }
1510 else if (bom == 0xFFFE) {
1511 q += 2;
1512 bo = 1;
1513 }
Tim Petersced69f82003-09-16 20:30:58 +00001514#else
Walter Dörwald69652032004-09-07 20:24:22 +00001515 if (bom == 0xFEFF) {
1516 q += 2;
1517 bo = 1;
1518 }
1519 else if (bom == 0xFFFE) {
1520 q += 2;
1521 bo = -1;
1522 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001523#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001524 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526
Tim Peters772747b2001-08-09 22:21:55 +00001527 if (bo == -1) {
1528 /* force LE */
1529 ihi = 1;
1530 ilo = 0;
1531 }
1532 else if (bo == 1) {
1533 /* force BE */
1534 ihi = 0;
1535 ilo = 1;
1536 }
1537
1538 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001539 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001540 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001541 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001542 if (consumed)
1543 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001544 errmsg = "truncated data";
1545 startinpos = ((const char *)q)-starts;
1546 endinpos = ((const char *)e)-starts;
1547 goto utf16Error;
1548 /* The remaining input chars are ignored if the callback
1549 chooses to skip the input */
1550 }
1551 ch = (q[ihi] << 8) | q[ilo];
1552
Tim Peters772747b2001-08-09 22:21:55 +00001553 q += 2;
1554
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 if (ch < 0xD800 || ch > 0xDFFF) {
1556 *p++ = ch;
1557 continue;
1558 }
1559
1560 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001561 if (q >= e) {
1562 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001563 startinpos = (((const char *)q)-2)-starts;
1564 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001565 goto utf16Error;
1566 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001567 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001568 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1569 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001570 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001571#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001572 *p++ = ch;
1573 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001574#else
1575 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001576#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001577 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001578 }
1579 else {
1580 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001581 startinpos = (((const char *)q)-4)-starts;
1582 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001583 goto utf16Error;
1584 }
1585
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001587 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001588 startinpos = (((const char *)q)-2)-starts;
1589 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001590 /* Fall through to report the error */
1591
1592 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001593 outpos = p-PyUnicode_AS_UNICODE(unicode);
1594 if (unicode_decode_call_errorhandler(
1595 errors, &errorHandler,
1596 "utf16", errmsg,
1597 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1598 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001599 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600 }
1601
1602 if (byteorder)
1603 *byteorder = bo;
1604
Walter Dörwald69652032004-09-07 20:24:22 +00001605 if (consumed)
1606 *consumed = (const char *)q-starts;
1607
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001609 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610 goto onError;
1611
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001612 Py_XDECREF(errorHandler);
1613 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 return (PyObject *)unicode;
1615
1616onError:
1617 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001618 Py_XDECREF(errorHandler);
1619 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 return NULL;
1621}
1622
Tim Peters772747b2001-08-09 22:21:55 +00001623PyObject *
1624PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1625 int size,
1626 const char *errors,
1627 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628{
1629 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001630 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001631#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001632 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001633#else
1634 const int pairs = 0;
1635#endif
Tim Peters772747b2001-08-09 22:21:55 +00001636 /* Offsets from p for storing byte pairs in the right order. */
1637#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1638 int ihi = 1, ilo = 0;
1639#else
1640 int ihi = 0, ilo = 1;
1641#endif
1642
1643#define STORECHAR(CH) \
1644 do { \
1645 p[ihi] = ((CH) >> 8) & 0xff; \
1646 p[ilo] = (CH) & 0xff; \
1647 p += 2; \
1648 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001650#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001651 for (i = pairs = 0; i < size; i++)
1652 if (s[i] >= 0x10000)
1653 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001654#endif
Tim Petersced69f82003-09-16 20:30:58 +00001655 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001656 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657 if (v == NULL)
1658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659
Tim Peters772747b2001-08-09 22:21:55 +00001660 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001662 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001663 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001664 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001665
1666 if (byteorder == -1) {
1667 /* force LE */
1668 ihi = 1;
1669 ilo = 0;
1670 }
1671 else if (byteorder == 1) {
1672 /* force BE */
1673 ihi = 0;
1674 ilo = 1;
1675 }
1676
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001677 while (size-- > 0) {
1678 Py_UNICODE ch = *s++;
1679 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001680#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001681 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001682 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1683 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001685#endif
Tim Peters772747b2001-08-09 22:21:55 +00001686 STORECHAR(ch);
1687 if (ch2)
1688 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001690 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001691#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692}
1693
1694PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1695{
1696 if (!PyUnicode_Check(unicode)) {
1697 PyErr_BadArgument();
1698 return NULL;
1699 }
1700 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1701 PyUnicode_GET_SIZE(unicode),
1702 NULL,
1703 0);
1704}
1705
1706/* --- Unicode Escape Codec ----------------------------------------------- */
1707
Fredrik Lundh06d12682001-01-24 07:59:11 +00001708static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001709
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1711 int size,
1712 const char *errors)
1713{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001714 const char *starts = s;
1715 int startinpos;
1716 int endinpos;
1717 int outpos;
1718 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001720 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001721 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001722 char* message;
1723 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001724 PyObject *errorHandler = NULL;
1725 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001726
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 /* Escaped strings will always be longer than the resulting
1728 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001729 length after conversion to the true value.
1730 (but if the error callback returns a long replacement string
1731 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732 v = _PyUnicode_New(size);
1733 if (v == NULL)
1734 goto onError;
1735 if (size == 0)
1736 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001737
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001738 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001740
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 while (s < end) {
1742 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001743 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001744 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745
1746 /* Non-escape characters are interpreted as Unicode ordinals */
1747 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001748 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 continue;
1750 }
1751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753 /* \ - Escapes */
1754 s++;
1755 switch (*s++) {
1756
1757 /* \x escapes */
1758 case '\n': break;
1759 case '\\': *p++ = '\\'; break;
1760 case '\'': *p++ = '\''; break;
1761 case '\"': *p++ = '\"'; break;
1762 case 'b': *p++ = '\b'; break;
1763 case 'f': *p++ = '\014'; break; /* FF */
1764 case 't': *p++ = '\t'; break;
1765 case 'n': *p++ = '\n'; break;
1766 case 'r': *p++ = '\r'; break;
1767 case 'v': *p++ = '\013'; break; /* VT */
1768 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1769
1770 /* \OOO (octal) escapes */
1771 case '0': case '1': case '2': case '3':
1772 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001773 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001775 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001777 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001779 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 break;
1781
Fredrik Lundhccc74732001-02-18 22:13:49 +00001782 /* hex escapes */
1783 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001785 digits = 2;
1786 message = "truncated \\xXX escape";
1787 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788
Fredrik Lundhccc74732001-02-18 22:13:49 +00001789 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001791 digits = 4;
1792 message = "truncated \\uXXXX escape";
1793 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794
Fredrik Lundhccc74732001-02-18 22:13:49 +00001795 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001796 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001797 digits = 8;
1798 message = "truncated \\UXXXXXXXX escape";
1799 hexescape:
1800 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001801 outpos = p-PyUnicode_AS_UNICODE(v);
1802 if (s+digits>end) {
1803 endinpos = size;
1804 if (unicode_decode_call_errorhandler(
1805 errors, &errorHandler,
1806 "unicodeescape", "end of string in escape sequence",
1807 starts, size, &startinpos, &endinpos, &exc, &s,
1808 (PyObject **)&v, &outpos, &p))
1809 goto onError;
1810 goto nextByte;
1811 }
1812 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001813 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001814 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 endinpos = (s+i+1)-starts;
1816 if (unicode_decode_call_errorhandler(
1817 errors, &errorHandler,
1818 "unicodeescape", message,
1819 starts, size, &startinpos, &endinpos, &exc, &s,
1820 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001821 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001822 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001823 }
1824 chr = (chr<<4) & ~0xF;
1825 if (c >= '0' && c <= '9')
1826 chr += c - '0';
1827 else if (c >= 'a' && c <= 'f')
1828 chr += 10 + c - 'a';
1829 else
1830 chr += 10 + c - 'A';
1831 }
1832 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001833 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001834 /* _decoding_error will have already written into the
1835 target buffer. */
1836 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001837 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001838 /* when we get here, chr is a 32-bit unicode character */
1839 if (chr <= 0xffff)
1840 /* UCS-2 character */
1841 *p++ = (Py_UNICODE) chr;
1842 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001843 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001844 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001845#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001846 *p++ = chr;
1847#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001848 chr -= 0x10000L;
1849 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001850 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001851#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001852 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001853 endinpos = s-starts;
1854 outpos = p-PyUnicode_AS_UNICODE(v);
1855 if (unicode_decode_call_errorhandler(
1856 errors, &errorHandler,
1857 "unicodeescape", "illegal Unicode character",
1858 starts, size, &startinpos, &endinpos, &exc, &s,
1859 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001860 goto onError;
1861 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862 break;
1863
1864 /* \N{name} */
1865 case 'N':
1866 message = "malformed \\N character escape";
1867 if (ucnhash_CAPI == NULL) {
1868 /* load the unicode data module */
1869 PyObject *m, *v;
1870 m = PyImport_ImportModule("unicodedata");
1871 if (m == NULL)
1872 goto ucnhashError;
1873 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1874 Py_DECREF(m);
1875 if (v == NULL)
1876 goto ucnhashError;
1877 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1878 Py_DECREF(v);
1879 if (ucnhash_CAPI == NULL)
1880 goto ucnhashError;
1881 }
1882 if (*s == '{') {
1883 const char *start = s+1;
1884 /* look for the closing brace */
1885 while (*s != '}' && s < end)
1886 s++;
1887 if (s > start && s < end && *s == '}') {
1888 /* found a name. look it up in the unicode database */
1889 message = "unknown Unicode character name";
1890 s++;
1891 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1892 goto store;
1893 }
1894 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 endinpos = s-starts;
1896 outpos = p-PyUnicode_AS_UNICODE(v);
1897 if (unicode_decode_call_errorhandler(
1898 errors, &errorHandler,
1899 "unicodeescape", message,
1900 starts, size, &startinpos, &endinpos, &exc, &s,
1901 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001902 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001903 break;
1904
1905 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001906 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001907 message = "\\ at end of string";
1908 s--;
1909 endinpos = s-starts;
1910 outpos = p-PyUnicode_AS_UNICODE(v);
1911 if (unicode_decode_call_errorhandler(
1912 errors, &errorHandler,
1913 "unicodeescape", message,
1914 starts, size, &startinpos, &endinpos, &exc, &s,
1915 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001916 goto onError;
1917 }
1918 else {
1919 *p++ = '\\';
1920 *p++ = (unsigned char)s[-1];
1921 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001922 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001924 nextByte:
1925 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001927 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001928 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001929 Py_XDECREF(errorHandler);
1930 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001932
Fredrik Lundhccc74732001-02-18 22:13:49 +00001933ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001934 PyErr_SetString(
1935 PyExc_UnicodeError,
1936 "\\N escapes not supported (can't load unicodedata module)"
1937 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001938 Py_XDECREF(errorHandler);
1939 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001940 return NULL;
1941
Fredrik Lundhccc74732001-02-18 22:13:49 +00001942onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001944 Py_XDECREF(errorHandler);
1945 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001946 return NULL;
1947}
1948
1949/* Return a Unicode-Escape string version of the Unicode object.
1950
1951 If quotes is true, the string is enclosed in u"" or u'' quotes as
1952 appropriate.
1953
1954*/
1955
Barry Warsaw51ac5802000-03-20 16:36:48 +00001956static const Py_UNICODE *findchar(const Py_UNICODE *s,
1957 int size,
1958 Py_UNICODE ch);
1959
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960static
1961PyObject *unicodeescape_string(const Py_UNICODE *s,
1962 int size,
1963 int quotes)
1964{
1965 PyObject *repr;
1966 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001968 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969
1970 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1971 if (repr == NULL)
1972 return NULL;
1973
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001974 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975
1976 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001978 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 !findchar(s, size, '"')) ? '"' : '\'';
1980 }
1981 while (size-- > 0) {
1982 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001983
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001985 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001986 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 *p++ = '\\';
1988 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001989 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001990 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001991
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001992#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001993 /* Map 21-bit characters to '\U00xxxxxx' */
1994 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001995 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001996
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001997 /* Resize the string if necessary */
1998 if (offset + 12 > PyString_GET_SIZE(repr)) {
1999 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002000 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002001 p = PyString_AS_STRING(repr) + offset;
2002 }
2003
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002004 *p++ = '\\';
2005 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002006 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2007 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2008 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2009 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2010 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2011 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2012 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002013 *p++ = hexdigit[ch & 0x0000000F];
2014 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002015 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002016#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002017 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2018 else if (ch >= 0xD800 && ch < 0xDC00) {
2019 Py_UNICODE ch2;
2020 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002021
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002022 ch2 = *s++;
2023 size--;
2024 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2025 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2026 *p++ = '\\';
2027 *p++ = 'U';
2028 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2029 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2030 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2031 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2032 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2033 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2034 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2035 *p++ = hexdigit[ucs & 0x0000000F];
2036 continue;
2037 }
2038 /* Fall through: isolated surrogates are copied as-is */
2039 s--;
2040 size++;
2041 }
2042
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002044 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045 *p++ = '\\';
2046 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002047 *p++ = hexdigit[(ch >> 12) & 0x000F];
2048 *p++ = hexdigit[(ch >> 8) & 0x000F];
2049 *p++ = hexdigit[(ch >> 4) & 0x000F];
2050 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002052
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002053 /* Map special whitespace to '\t', \n', '\r' */
2054 else if (ch == '\t') {
2055 *p++ = '\\';
2056 *p++ = 't';
2057 }
2058 else if (ch == '\n') {
2059 *p++ = '\\';
2060 *p++ = 'n';
2061 }
2062 else if (ch == '\r') {
2063 *p++ = '\\';
2064 *p++ = 'r';
2065 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002066
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002067 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002068 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002070 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002071 *p++ = hexdigit[(ch >> 4) & 0x000F];
2072 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002073 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002074
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 /* Copy everything else as-is */
2076 else
2077 *p++ = (char) ch;
2078 }
2079 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002080 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081
2082 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002083 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 return repr;
2085}
2086
2087PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2088 int size)
2089{
2090 return unicodeescape_string(s, size, 0);
2091}
2092
2093PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2094{
2095 if (!PyUnicode_Check(unicode)) {
2096 PyErr_BadArgument();
2097 return NULL;
2098 }
2099 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2100 PyUnicode_GET_SIZE(unicode));
2101}
2102
2103/* --- Raw Unicode Escape Codec ------------------------------------------- */
2104
2105PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2106 int size,
2107 const char *errors)
2108{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002109 const char *starts = s;
2110 int startinpos;
2111 int endinpos;
2112 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002114 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 const char *end;
2116 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117 PyObject *errorHandler = NULL;
2118 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002119
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 /* Escaped strings will always be longer than the resulting
2121 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 length after conversion to the true value. (But decoding error
2123 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 v = _PyUnicode_New(size);
2125 if (v == NULL)
2126 goto onError;
2127 if (size == 0)
2128 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002129 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 end = s + size;
2131 while (s < end) {
2132 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002133 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002135 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136
2137 /* Non-escape characters are interpreted as Unicode ordinals */
2138 if (*s != '\\') {
2139 *p++ = (unsigned char)*s++;
2140 continue;
2141 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002142 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143
2144 /* \u-escapes are only interpreted iff the number of leading
2145 backslashes if odd */
2146 bs = s;
2147 for (;s < end;) {
2148 if (*s != '\\')
2149 break;
2150 *p++ = (unsigned char)*s++;
2151 }
2152 if (((s - bs) & 1) == 0 ||
2153 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002154 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155 continue;
2156 }
2157 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002158 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 s++;
2160
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002161 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002162 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002163 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002164 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002166 endinpos = s-starts;
2167 if (unicode_decode_call_errorhandler(
2168 errors, &errorHandler,
2169 "rawunicodeescape", "truncated \\uXXXX",
2170 starts, size, &startinpos, &endinpos, &exc, &s,
2171 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002173 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 }
2175 x = (x<<4) & ~0xF;
2176 if (c >= '0' && c <= '9')
2177 x += c - '0';
2178 else if (c >= 'a' && c <= 'f')
2179 x += 10 + c - 'a';
2180 else
2181 x += 10 + c - 'A';
2182 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002183#ifndef Py_UNICODE_WIDE
2184 if (x > 0x10000) {
2185 if (unicode_decode_call_errorhandler(
2186 errors, &errorHandler,
2187 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2188 starts, size, &startinpos, &endinpos, &exc, &s,
2189 (PyObject **)&v, &outpos, &p))
2190 goto onError;
2191 }
2192#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002193 *p++ = x;
2194 nextByte:
2195 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002197 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002198 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002199 Py_XDECREF(errorHandler);
2200 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002202
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 onError:
2204 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002205 Py_XDECREF(errorHandler);
2206 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 return NULL;
2208}
2209
2210PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2211 int size)
2212{
2213 PyObject *repr;
2214 char *p;
2215 char *q;
2216
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002217 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002218
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002219#ifdef Py_UNICODE_WIDE
2220 repr = PyString_FromStringAndSize(NULL, 10 * size);
2221#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002223#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 if (repr == NULL)
2225 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002226 if (size == 0)
2227 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228
2229 p = q = PyString_AS_STRING(repr);
2230 while (size-- > 0) {
2231 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002232#ifdef Py_UNICODE_WIDE
2233 /* Map 32-bit characters to '\Uxxxxxxxx' */
2234 if (ch >= 0x10000) {
2235 *p++ = '\\';
2236 *p++ = 'U';
2237 *p++ = hexdigit[(ch >> 28) & 0xf];
2238 *p++ = hexdigit[(ch >> 24) & 0xf];
2239 *p++ = hexdigit[(ch >> 20) & 0xf];
2240 *p++ = hexdigit[(ch >> 16) & 0xf];
2241 *p++ = hexdigit[(ch >> 12) & 0xf];
2242 *p++ = hexdigit[(ch >> 8) & 0xf];
2243 *p++ = hexdigit[(ch >> 4) & 0xf];
2244 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002245 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002246 else
2247#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 /* Map 16-bit characters to '\uxxxx' */
2249 if (ch >= 256) {
2250 *p++ = '\\';
2251 *p++ = 'u';
2252 *p++ = hexdigit[(ch >> 12) & 0xf];
2253 *p++ = hexdigit[(ch >> 8) & 0xf];
2254 *p++ = hexdigit[(ch >> 4) & 0xf];
2255 *p++ = hexdigit[ch & 15];
2256 }
2257 /* Copy everything else as-is */
2258 else
2259 *p++ = (char) ch;
2260 }
2261 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002262 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 return repr;
2264}
2265
2266PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2267{
2268 if (!PyUnicode_Check(unicode)) {
2269 PyErr_BadArgument();
2270 return NULL;
2271 }
2272 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2273 PyUnicode_GET_SIZE(unicode));
2274}
2275
2276/* --- Latin-1 Codec ------------------------------------------------------ */
2277
2278PyObject *PyUnicode_DecodeLatin1(const char *s,
2279 int size,
2280 const char *errors)
2281{
2282 PyUnicodeObject *v;
2283 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002284
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002286 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002287 Py_UNICODE r = *(unsigned char*)s;
2288 return PyUnicode_FromUnicode(&r, 1);
2289 }
2290
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 v = _PyUnicode_New(size);
2292 if (v == NULL)
2293 goto onError;
2294 if (size == 0)
2295 return (PyObject *)v;
2296 p = PyUnicode_AS_UNICODE(v);
2297 while (size-- > 0)
2298 *p++ = (unsigned char)*s++;
2299 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002300
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301 onError:
2302 Py_XDECREF(v);
2303 return NULL;
2304}
2305
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002306/* create or adjust a UnicodeEncodeError */
2307static void make_encode_exception(PyObject **exceptionObject,
2308 const char *encoding,
2309 const Py_UNICODE *unicode, int size,
2310 int startpos, int endpos,
2311 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002313 if (*exceptionObject == NULL) {
2314 *exceptionObject = PyUnicodeEncodeError_Create(
2315 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002316 }
2317 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002318 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2319 goto onError;
2320 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2321 goto onError;
2322 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2323 goto onError;
2324 return;
2325 onError:
2326 Py_DECREF(*exceptionObject);
2327 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328 }
2329}
2330
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002331/* raises a UnicodeEncodeError */
2332static void raise_encode_exception(PyObject **exceptionObject,
2333 const char *encoding,
2334 const Py_UNICODE *unicode, int size,
2335 int startpos, int endpos,
2336 const char *reason)
2337{
2338 make_encode_exception(exceptionObject,
2339 encoding, unicode, size, startpos, endpos, reason);
2340 if (*exceptionObject != NULL)
2341 PyCodec_StrictErrors(*exceptionObject);
2342}
2343
2344/* error handling callback helper:
2345 build arguments, call the callback and check the arguments,
2346 put the result into newpos and return the replacement string, which
2347 has to be freed by the caller */
2348static PyObject *unicode_encode_call_errorhandler(const char *errors,
2349 PyObject **errorHandler,
2350 const char *encoding, const char *reason,
2351 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2352 int startpos, int endpos,
2353 int *newpos)
2354{
2355 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2356
2357 PyObject *restuple;
2358 PyObject *resunicode;
2359
2360 if (*errorHandler == NULL) {
2361 *errorHandler = PyCodec_LookupError(errors);
2362 if (*errorHandler == NULL)
2363 return NULL;
2364 }
2365
2366 make_encode_exception(exceptionObject,
2367 encoding, unicode, size, startpos, endpos, reason);
2368 if (*exceptionObject == NULL)
2369 return NULL;
2370
2371 restuple = PyObject_CallFunctionObjArgs(
2372 *errorHandler, *exceptionObject, NULL);
2373 if (restuple == NULL)
2374 return NULL;
2375 if (!PyTuple_Check(restuple)) {
2376 PyErr_Format(PyExc_TypeError, &argparse[4]);
2377 Py_DECREF(restuple);
2378 return NULL;
2379 }
2380 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2381 &resunicode, newpos)) {
2382 Py_DECREF(restuple);
2383 return NULL;
2384 }
2385 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002386 *newpos = size+*newpos;
2387 if (*newpos<0 || *newpos>size) {
2388 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2389 Py_DECREF(restuple);
2390 return NULL;
2391 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002392 Py_INCREF(resunicode);
2393 Py_DECREF(restuple);
2394 return resunicode;
2395}
2396
2397static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2398 int size,
2399 const char *errors,
2400 int limit)
2401{
2402 /* output object */
2403 PyObject *res;
2404 /* pointers to the beginning and end+1 of input */
2405 const Py_UNICODE *startp = p;
2406 const Py_UNICODE *endp = p + size;
2407 /* pointer to the beginning of the unencodable characters */
2408 /* const Py_UNICODE *badp = NULL; */
2409 /* pointer into the output */
2410 char *str;
2411 /* current output position */
2412 int respos = 0;
2413 int ressize;
2414 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2415 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2416 PyObject *errorHandler = NULL;
2417 PyObject *exc = NULL;
2418 /* the following variable is used for caching string comparisons
2419 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2420 int known_errorHandler = -1;
2421
2422 /* allocate enough for a simple encoding without
2423 replacements, if we need more, we'll resize */
2424 res = PyString_FromStringAndSize(NULL, size);
2425 if (res == NULL)
2426 goto onError;
2427 if (size == 0)
2428 return res;
2429 str = PyString_AS_STRING(res);
2430 ressize = size;
2431
2432 while (p<endp) {
2433 Py_UNICODE c = *p;
2434
2435 /* can we encode this? */
2436 if (c<limit) {
2437 /* no overflow check, because we know that the space is enough */
2438 *str++ = (char)c;
2439 ++p;
2440 }
2441 else {
2442 int unicodepos = p-startp;
2443 int requiredsize;
2444 PyObject *repunicode;
2445 int repsize;
2446 int newpos;
2447 int respos;
2448 Py_UNICODE *uni2;
2449 /* startpos for collecting unencodable chars */
2450 const Py_UNICODE *collstart = p;
2451 const Py_UNICODE *collend = p;
2452 /* find all unecodable characters */
2453 while ((collend < endp) && ((*collend)>=limit))
2454 ++collend;
2455 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2456 if (known_errorHandler==-1) {
2457 if ((errors==NULL) || (!strcmp(errors, "strict")))
2458 known_errorHandler = 1;
2459 else if (!strcmp(errors, "replace"))
2460 known_errorHandler = 2;
2461 else if (!strcmp(errors, "ignore"))
2462 known_errorHandler = 3;
2463 else if (!strcmp(errors, "xmlcharrefreplace"))
2464 known_errorHandler = 4;
2465 else
2466 known_errorHandler = 0;
2467 }
2468 switch (known_errorHandler) {
2469 case 1: /* strict */
2470 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2471 goto onError;
2472 case 2: /* replace */
2473 while (collstart++<collend)
2474 *str++ = '?'; /* fall through */
2475 case 3: /* ignore */
2476 p = collend;
2477 break;
2478 case 4: /* xmlcharrefreplace */
2479 respos = str-PyString_AS_STRING(res);
2480 /* determine replacement size (temporarily (mis)uses p) */
2481 for (p = collstart, repsize = 0; p < collend; ++p) {
2482 if (*p<10)
2483 repsize += 2+1+1;
2484 else if (*p<100)
2485 repsize += 2+2+1;
2486 else if (*p<1000)
2487 repsize += 2+3+1;
2488 else if (*p<10000)
2489 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002490#ifndef Py_UNICODE_WIDE
2491 else
2492 repsize += 2+5+1;
2493#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002494 else if (*p<100000)
2495 repsize += 2+5+1;
2496 else if (*p<1000000)
2497 repsize += 2+6+1;
2498 else
2499 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002500#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002501 }
2502 requiredsize = respos+repsize+(endp-collend);
2503 if (requiredsize > ressize) {
2504 if (requiredsize<2*ressize)
2505 requiredsize = 2*ressize;
2506 if (_PyString_Resize(&res, requiredsize))
2507 goto onError;
2508 str = PyString_AS_STRING(res) + respos;
2509 ressize = requiredsize;
2510 }
2511 /* generate replacement (temporarily (mis)uses p) */
2512 for (p = collstart; p < collend; ++p) {
2513 str += sprintf(str, "&#%d;", (int)*p);
2514 }
2515 p = collend;
2516 break;
2517 default:
2518 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2519 encoding, reason, startp, size, &exc,
2520 collstart-startp, collend-startp, &newpos);
2521 if (repunicode == NULL)
2522 goto onError;
2523 /* need more space? (at least enough for what we
2524 have+the replacement+the rest of the string, so
2525 we won't have to check space for encodable characters) */
2526 respos = str-PyString_AS_STRING(res);
2527 repsize = PyUnicode_GET_SIZE(repunicode);
2528 requiredsize = respos+repsize+(endp-collend);
2529 if (requiredsize > ressize) {
2530 if (requiredsize<2*ressize)
2531 requiredsize = 2*ressize;
2532 if (_PyString_Resize(&res, requiredsize)) {
2533 Py_DECREF(repunicode);
2534 goto onError;
2535 }
2536 str = PyString_AS_STRING(res) + respos;
2537 ressize = requiredsize;
2538 }
2539 /* check if there is anything unencodable in the replacement
2540 and copy it to the output */
2541 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2542 c = *uni2;
2543 if (c >= limit) {
2544 raise_encode_exception(&exc, encoding, startp, size,
2545 unicodepos, unicodepos+1, reason);
2546 Py_DECREF(repunicode);
2547 goto onError;
2548 }
2549 *str = (char)c;
2550 }
2551 p = startp + newpos;
2552 Py_DECREF(repunicode);
2553 }
2554 }
2555 }
2556 /* Resize if we allocated to much */
2557 respos = str-PyString_AS_STRING(res);
2558 if (respos<ressize)
2559 /* If this falls res will be NULL */
2560 _PyString_Resize(&res, respos);
2561 Py_XDECREF(errorHandler);
2562 Py_XDECREF(exc);
2563 return res;
2564
2565 onError:
2566 Py_XDECREF(res);
2567 Py_XDECREF(errorHandler);
2568 Py_XDECREF(exc);
2569 return NULL;
2570}
2571
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2573 int size,
2574 const char *errors)
2575{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002576 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577}
2578
2579PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2580{
2581 if (!PyUnicode_Check(unicode)) {
2582 PyErr_BadArgument();
2583 return NULL;
2584 }
2585 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2586 PyUnicode_GET_SIZE(unicode),
2587 NULL);
2588}
2589
2590/* --- 7-bit ASCII Codec -------------------------------------------------- */
2591
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592PyObject *PyUnicode_DecodeASCII(const char *s,
2593 int size,
2594 const char *errors)
2595{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002596 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 PyUnicodeObject *v;
2598 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002599 int startinpos;
2600 int endinpos;
2601 int outpos;
2602 const char *e;
2603 PyObject *errorHandler = NULL;
2604 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002605
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002607 if (size == 1 && *(unsigned char*)s < 128) {
2608 Py_UNICODE r = *(unsigned char*)s;
2609 return PyUnicode_FromUnicode(&r, 1);
2610 }
Tim Petersced69f82003-09-16 20:30:58 +00002611
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612 v = _PyUnicode_New(size);
2613 if (v == NULL)
2614 goto onError;
2615 if (size == 0)
2616 return (PyObject *)v;
2617 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002618 e = s + size;
2619 while (s < e) {
2620 register unsigned char c = (unsigned char)*s;
2621 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002623 ++s;
2624 }
2625 else {
2626 startinpos = s-starts;
2627 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002628 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002629 if (unicode_decode_call_errorhandler(
2630 errors, &errorHandler,
2631 "ascii", "ordinal not in range(128)",
2632 starts, size, &startinpos, &endinpos, &exc, &s,
2633 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002637 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002638 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002639 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 Py_XDECREF(errorHandler);
2641 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002643
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644 onError:
2645 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002646 Py_XDECREF(errorHandler);
2647 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648 return NULL;
2649}
2650
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2652 int size,
2653 const char *errors)
2654{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002655 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656}
2657
2658PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2659{
2660 if (!PyUnicode_Check(unicode)) {
2661 PyErr_BadArgument();
2662 return NULL;
2663 }
2664 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2665 PyUnicode_GET_SIZE(unicode),
2666 NULL);
2667}
2668
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002669#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002670
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002671/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002672
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002673PyObject *PyUnicode_DecodeMBCS(const char *s,
2674 int size,
2675 const char *errors)
2676{
2677 PyUnicodeObject *v;
2678 Py_UNICODE *p;
2679
2680 /* First get the size of the result */
2681 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002682 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002683 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2684
2685 v = _PyUnicode_New(usize);
2686 if (v == NULL)
2687 return NULL;
2688 if (usize == 0)
2689 return (PyObject *)v;
2690 p = PyUnicode_AS_UNICODE(v);
2691 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2692 Py_DECREF(v);
2693 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2694 }
2695
2696 return (PyObject *)v;
2697}
2698
2699PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2700 int size,
2701 const char *errors)
2702{
2703 PyObject *repr;
2704 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002705 DWORD mbcssize;
2706
2707 /* If there are no characters, bail now! */
2708 if (size==0)
2709 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002710
2711 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002712 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002713 if (mbcssize==0)
2714 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2715
2716 repr = PyString_FromStringAndSize(NULL, mbcssize);
2717 if (repr == NULL)
2718 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002719 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002720 return repr;
2721
2722 /* Do the conversion */
2723 s = PyString_AS_STRING(repr);
2724 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2725 Py_DECREF(repr);
2726 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2727 }
2728 return repr;
2729}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002730
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002731PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2732{
2733 if (!PyUnicode_Check(unicode)) {
2734 PyErr_BadArgument();
2735 return NULL;
2736 }
2737 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2738 PyUnicode_GET_SIZE(unicode),
2739 NULL);
2740}
2741
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002742#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002743
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744/* --- Character Mapping Codec -------------------------------------------- */
2745
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746PyObject *PyUnicode_DecodeCharmap(const char *s,
2747 int size,
2748 PyObject *mapping,
2749 const char *errors)
2750{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002751 const char *starts = s;
2752 int startinpos;
2753 int endinpos;
2754 int outpos;
2755 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 PyUnicodeObject *v;
2757 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002758 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002759 PyObject *errorHandler = NULL;
2760 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002761
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 /* Default to Latin-1 */
2763 if (mapping == NULL)
2764 return PyUnicode_DecodeLatin1(s, size, errors);
2765
2766 v = _PyUnicode_New(size);
2767 if (v == NULL)
2768 goto onError;
2769 if (size == 0)
2770 return (PyObject *)v;
2771 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 e = s + size;
2773 while (s < e) {
2774 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 PyObject *w, *x;
2776
2777 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2778 w = PyInt_FromLong((long)ch);
2779 if (w == NULL)
2780 goto onError;
2781 x = PyObject_GetItem(mapping, w);
2782 Py_DECREF(w);
2783 if (x == NULL) {
2784 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002785 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002787 x = Py_None;
2788 Py_INCREF(x);
2789 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002790 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 }
2792
2793 /* Apply mapping */
2794 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002795 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796 if (value < 0 || value > 65535) {
2797 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002798 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 Py_DECREF(x);
2800 goto onError;
2801 }
2802 *p++ = (Py_UNICODE)value;
2803 }
2804 else if (x == Py_None) {
2805 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002806 outpos = p-PyUnicode_AS_UNICODE(v);
2807 startinpos = s-starts;
2808 endinpos = startinpos+1;
2809 if (unicode_decode_call_errorhandler(
2810 errors, &errorHandler,
2811 "charmap", "character maps to <undefined>",
2812 starts, size, &startinpos, &endinpos, &exc, &s,
2813 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 Py_DECREF(x);
2815 goto onError;
2816 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002817 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 }
2819 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002820 int targetsize = PyUnicode_GET_SIZE(x);
2821
2822 if (targetsize == 1)
2823 /* 1-1 mapping */
2824 *p++ = *PyUnicode_AS_UNICODE(x);
2825
2826 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002828 if (targetsize > extrachars) {
2829 /* resize first */
2830 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2831 int needed = (targetsize - extrachars) + \
2832 (targetsize << 2);
2833 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002834 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002835 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002836 Py_DECREF(x);
2837 goto onError;
2838 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002839 p = PyUnicode_AS_UNICODE(v) + oldpos;
2840 }
2841 Py_UNICODE_COPY(p,
2842 PyUnicode_AS_UNICODE(x),
2843 targetsize);
2844 p += targetsize;
2845 extrachars -= targetsize;
2846 }
2847 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 }
2849 else {
2850 /* wrong return value */
2851 PyErr_SetString(PyExc_TypeError,
2852 "character mapping must return integer, None or unicode");
2853 Py_DECREF(x);
2854 goto onError;
2855 }
2856 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858 }
2859 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002860 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002862 Py_XDECREF(errorHandler);
2863 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002865
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 Py_XDECREF(errorHandler);
2868 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 Py_XDECREF(v);
2870 return NULL;
2871}
2872
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002873/* Lookup the character ch in the mapping. If the character
2874 can't be found, Py_None is returned (or NULL, if another
2875 error occured). */
2876static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002878 PyObject *w = PyInt_FromLong((long)c);
2879 PyObject *x;
2880
2881 if (w == NULL)
2882 return NULL;
2883 x = PyObject_GetItem(mapping, w);
2884 Py_DECREF(w);
2885 if (x == NULL) {
2886 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2887 /* No mapping found means: mapping is undefined. */
2888 PyErr_Clear();
2889 x = Py_None;
2890 Py_INCREF(x);
2891 return x;
2892 } else
2893 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002895 else if (x == Py_None)
2896 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002897 else if (PyInt_Check(x)) {
2898 long value = PyInt_AS_LONG(x);
2899 if (value < 0 || value > 255) {
2900 PyErr_SetString(PyExc_TypeError,
2901 "character mapping must be in range(256)");
2902 Py_DECREF(x);
2903 return NULL;
2904 }
2905 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002907 else if (PyString_Check(x))
2908 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002910 /* wrong return value */
2911 PyErr_SetString(PyExc_TypeError,
2912 "character mapping must return integer, None or str");
2913 Py_DECREF(x);
2914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 }
2916}
2917
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002918/* lookup the character, put the result in the output string and adjust
2919 various state variables. Reallocate the output string if not enough
2920 space is available. Return a new reference to the object that
2921 was put in the output buffer, or Py_None, if the mapping was undefined
2922 (in which case no character was written) or NULL, if a
2923 reallocation error ocurred. The called must decref the result */
2924static
2925PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2926 PyObject **outobj, int *outpos)
2927{
2928 PyObject *rep = charmapencode_lookup(c, mapping);
2929
2930 if (rep==NULL)
2931 return NULL;
2932 else if (rep==Py_None)
2933 return rep;
2934 else {
2935 char *outstart = PyString_AS_STRING(*outobj);
2936 int outsize = PyString_GET_SIZE(*outobj);
2937 if (PyInt_Check(rep)) {
2938 int requiredsize = *outpos+1;
2939 if (outsize<requiredsize) {
2940 /* exponentially overallocate to minimize reallocations */
2941 if (requiredsize < 2*outsize)
2942 requiredsize = 2*outsize;
2943 if (_PyString_Resize(outobj, requiredsize)) {
2944 Py_DECREF(rep);
2945 return NULL;
2946 }
2947 outstart = PyString_AS_STRING(*outobj);
2948 }
2949 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2950 }
2951 else {
2952 const char *repchars = PyString_AS_STRING(rep);
2953 int repsize = PyString_GET_SIZE(rep);
2954 int requiredsize = *outpos+repsize;
2955 if (outsize<requiredsize) {
2956 /* exponentially overallocate to minimize reallocations */
2957 if (requiredsize < 2*outsize)
2958 requiredsize = 2*outsize;
2959 if (_PyString_Resize(outobj, requiredsize)) {
2960 Py_DECREF(rep);
2961 return NULL;
2962 }
2963 outstart = PyString_AS_STRING(*outobj);
2964 }
2965 memcpy(outstart + *outpos, repchars, repsize);
2966 *outpos += repsize;
2967 }
2968 }
2969 return rep;
2970}
2971
2972/* handle an error in PyUnicode_EncodeCharmap
2973 Return 0 on success, -1 on error */
2974static
2975int charmap_encoding_error(
2976 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2977 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002978 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979 PyObject **res, int *respos)
2980{
2981 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2982 int repsize;
2983 int newpos;
2984 Py_UNICODE *uni2;
2985 /* startpos for collecting unencodable chars */
2986 int collstartpos = *inpos;
2987 int collendpos = *inpos+1;
2988 int collpos;
2989 char *encoding = "charmap";
2990 char *reason = "character maps to <undefined>";
2991
2992 PyObject *x;
2993 /* find all unencodable characters */
2994 while (collendpos < size) {
2995 x = charmapencode_lookup(p[collendpos], mapping);
2996 if (x==NULL)
2997 return -1;
2998 else if (x!=Py_None) {
2999 Py_DECREF(x);
3000 break;
3001 }
3002 Py_DECREF(x);
3003 ++collendpos;
3004 }
3005 /* cache callback name lookup
3006 * (if not done yet, i.e. it's the first error) */
3007 if (*known_errorHandler==-1) {
3008 if ((errors==NULL) || (!strcmp(errors, "strict")))
3009 *known_errorHandler = 1;
3010 else if (!strcmp(errors, "replace"))
3011 *known_errorHandler = 2;
3012 else if (!strcmp(errors, "ignore"))
3013 *known_errorHandler = 3;
3014 else if (!strcmp(errors, "xmlcharrefreplace"))
3015 *known_errorHandler = 4;
3016 else
3017 *known_errorHandler = 0;
3018 }
3019 switch (*known_errorHandler) {
3020 case 1: /* strict */
3021 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3022 return -1;
3023 case 2: /* replace */
3024 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3025 x = charmapencode_output('?', mapping, res, respos);
3026 if (x==NULL) {
3027 return -1;
3028 }
3029 else if (x==Py_None) {
3030 Py_DECREF(x);
3031 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3032 return -1;
3033 }
3034 Py_DECREF(x);
3035 }
3036 /* fall through */
3037 case 3: /* ignore */
3038 *inpos = collendpos;
3039 break;
3040 case 4: /* xmlcharrefreplace */
3041 /* generate replacement (temporarily (mis)uses p) */
3042 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3043 char buffer[2+29+1+1];
3044 char *cp;
3045 sprintf(buffer, "&#%d;", (int)p[collpos]);
3046 for (cp = buffer; *cp; ++cp) {
3047 x = charmapencode_output(*cp, mapping, res, respos);
3048 if (x==NULL)
3049 return -1;
3050 else if (x==Py_None) {
3051 Py_DECREF(x);
3052 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3053 return -1;
3054 }
3055 Py_DECREF(x);
3056 }
3057 }
3058 *inpos = collendpos;
3059 break;
3060 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003061 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 encoding, reason, p, size, exceptionObject,
3063 collstartpos, collendpos, &newpos);
3064 if (repunicode == NULL)
3065 return -1;
3066 /* generate replacement */
3067 repsize = PyUnicode_GET_SIZE(repunicode);
3068 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3069 x = charmapencode_output(*uni2, mapping, res, respos);
3070 if (x==NULL) {
3071 Py_DECREF(repunicode);
3072 return -1;
3073 }
3074 else if (x==Py_None) {
3075 Py_DECREF(repunicode);
3076 Py_DECREF(x);
3077 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3078 return -1;
3079 }
3080 Py_DECREF(x);
3081 }
3082 *inpos = newpos;
3083 Py_DECREF(repunicode);
3084 }
3085 return 0;
3086}
3087
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3089 int size,
3090 PyObject *mapping,
3091 const char *errors)
3092{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003093 /* output object */
3094 PyObject *res = NULL;
3095 /* current input position */
3096 int inpos = 0;
3097 /* current output position */
3098 int respos = 0;
3099 PyObject *errorHandler = NULL;
3100 PyObject *exc = NULL;
3101 /* the following variable is used for caching string comparisons
3102 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3103 * 3=ignore, 4=xmlcharrefreplace */
3104 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105
3106 /* Default to Latin-1 */
3107 if (mapping == NULL)
3108 return PyUnicode_EncodeLatin1(p, size, errors);
3109
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003110 /* allocate enough for a simple encoding without
3111 replacements, if we need more, we'll resize */
3112 res = PyString_FromStringAndSize(NULL, size);
3113 if (res == NULL)
3114 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003115 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003116 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003118 while (inpos<size) {
3119 /* try to encode it */
3120 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3121 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003123 if (x==Py_None) { /* unencodable character */
3124 if (charmap_encoding_error(p, size, &inpos, mapping,
3125 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003126 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003127 &res, &respos)) {
3128 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003129 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003132 else
3133 /* done with this character => adjust input position */
3134 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135 Py_DECREF(x);
3136 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003138 /* Resize if we allocated to much */
3139 if (respos<PyString_GET_SIZE(res)) {
3140 if (_PyString_Resize(&res, respos))
3141 goto onError;
3142 }
3143 Py_XDECREF(exc);
3144 Py_XDECREF(errorHandler);
3145 return res;
3146
3147 onError:
3148 Py_XDECREF(res);
3149 Py_XDECREF(exc);
3150 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151 return NULL;
3152}
3153
3154PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3155 PyObject *mapping)
3156{
3157 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3158 PyErr_BadArgument();
3159 return NULL;
3160 }
3161 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3162 PyUnicode_GET_SIZE(unicode),
3163 mapping,
3164 NULL);
3165}
3166
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003167/* create or adjust a UnicodeTranslateError */
3168static void make_translate_exception(PyObject **exceptionObject,
3169 const Py_UNICODE *unicode, int size,
3170 int startpos, int endpos,
3171 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003173 if (*exceptionObject == NULL) {
3174 *exceptionObject = PyUnicodeTranslateError_Create(
3175 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 }
3177 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003178 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3179 goto onError;
3180 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3181 goto onError;
3182 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3183 goto onError;
3184 return;
3185 onError:
3186 Py_DECREF(*exceptionObject);
3187 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188 }
3189}
3190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003191/* raises a UnicodeTranslateError */
3192static void raise_translate_exception(PyObject **exceptionObject,
3193 const Py_UNICODE *unicode, int size,
3194 int startpos, int endpos,
3195 const char *reason)
3196{
3197 make_translate_exception(exceptionObject,
3198 unicode, size, startpos, endpos, reason);
3199 if (*exceptionObject != NULL)
3200 PyCodec_StrictErrors(*exceptionObject);
3201}
3202
3203/* error handling callback helper:
3204 build arguments, call the callback and check the arguments,
3205 put the result into newpos and return the replacement string, which
3206 has to be freed by the caller */
3207static PyObject *unicode_translate_call_errorhandler(const char *errors,
3208 PyObject **errorHandler,
3209 const char *reason,
3210 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3211 int startpos, int endpos,
3212 int *newpos)
3213{
3214 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3215
3216 PyObject *restuple;
3217 PyObject *resunicode;
3218
3219 if (*errorHandler == NULL) {
3220 *errorHandler = PyCodec_LookupError(errors);
3221 if (*errorHandler == NULL)
3222 return NULL;
3223 }
3224
3225 make_translate_exception(exceptionObject,
3226 unicode, size, startpos, endpos, reason);
3227 if (*exceptionObject == NULL)
3228 return NULL;
3229
3230 restuple = PyObject_CallFunctionObjArgs(
3231 *errorHandler, *exceptionObject, NULL);
3232 if (restuple == NULL)
3233 return NULL;
3234 if (!PyTuple_Check(restuple)) {
3235 PyErr_Format(PyExc_TypeError, &argparse[4]);
3236 Py_DECREF(restuple);
3237 return NULL;
3238 }
3239 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3240 &resunicode, newpos)) {
3241 Py_DECREF(restuple);
3242 return NULL;
3243 }
3244 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003245 *newpos = size+*newpos;
3246 if (*newpos<0 || *newpos>size) {
3247 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3248 Py_DECREF(restuple);
3249 return NULL;
3250 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003251 Py_INCREF(resunicode);
3252 Py_DECREF(restuple);
3253 return resunicode;
3254}
3255
3256/* Lookup the character ch in the mapping and put the result in result,
3257 which must be decrefed by the caller.
3258 Return 0 on success, -1 on error */
3259static
3260int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3261{
3262 PyObject *w = PyInt_FromLong((long)c);
3263 PyObject *x;
3264
3265 if (w == NULL)
3266 return -1;
3267 x = PyObject_GetItem(mapping, w);
3268 Py_DECREF(w);
3269 if (x == NULL) {
3270 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3271 /* No mapping found means: use 1:1 mapping. */
3272 PyErr_Clear();
3273 *result = NULL;
3274 return 0;
3275 } else
3276 return -1;
3277 }
3278 else if (x == Py_None) {
3279 *result = x;
3280 return 0;
3281 }
3282 else if (PyInt_Check(x)) {
3283 long value = PyInt_AS_LONG(x);
3284 long max = PyUnicode_GetMax();
3285 if (value < 0 || value > max) {
3286 PyErr_Format(PyExc_TypeError,
3287 "character mapping must be in range(0x%lx)", max+1);
3288 Py_DECREF(x);
3289 return -1;
3290 }
3291 *result = x;
3292 return 0;
3293 }
3294 else if (PyUnicode_Check(x)) {
3295 *result = x;
3296 return 0;
3297 }
3298 else {
3299 /* wrong return value */
3300 PyErr_SetString(PyExc_TypeError,
3301 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003302 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003303 return -1;
3304 }
3305}
3306/* ensure that *outobj is at least requiredsize characters long,
3307if not reallocate and adjust various state variables.
3308Return 0 on success, -1 on error */
3309static
Walter Dörwald4894c302003-10-24 14:25:28 +00003310int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003311 int requiredsize)
3312{
Walter Dörwald4894c302003-10-24 14:25:28 +00003313 int oldsize = PyUnicode_GET_SIZE(*outobj);
3314 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315 /* remember old output position */
3316 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3317 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003318 if (requiredsize < 2 * oldsize)
3319 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003320 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321 return -1;
3322 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003323 }
3324 return 0;
3325}
3326/* lookup the character, put the result in the output string and adjust
3327 various state variables. Return a new reference to the object that
3328 was put in the output buffer in *result, or Py_None, if the mapping was
3329 undefined (in which case no character was written).
3330 The called must decref result.
3331 Return 0 on success, -1 on error. */
3332static
Walter Dörwald4894c302003-10-24 14:25:28 +00003333int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3334 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3335 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003336{
Walter Dörwald4894c302003-10-24 14:25:28 +00003337 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338 return -1;
3339 if (*res==NULL) {
3340 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003341 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 }
3343 else if (*res==Py_None)
3344 ;
3345 else if (PyInt_Check(*res)) {
3346 /* no overflow check, because we know that the space is enough */
3347 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3348 }
3349 else if (PyUnicode_Check(*res)) {
3350 int repsize = PyUnicode_GET_SIZE(*res);
3351 if (repsize==1) {
3352 /* no overflow check, because we know that the space is enough */
3353 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3354 }
3355 else if (repsize!=0) {
3356 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003357 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003358 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003359 repsize - 1;
3360 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003361 return -1;
3362 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3363 *outp += repsize;
3364 }
3365 }
3366 else
3367 return -1;
3368 return 0;
3369}
3370
3371PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372 int size,
3373 PyObject *mapping,
3374 const char *errors)
3375{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 /* output object */
3377 PyObject *res = NULL;
3378 /* pointers to the beginning and end+1 of input */
3379 const Py_UNICODE *startp = p;
3380 const Py_UNICODE *endp = p + size;
3381 /* pointer into the output */
3382 Py_UNICODE *str;
3383 /* current output position */
3384 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003385 char *reason = "character maps to <undefined>";
3386 PyObject *errorHandler = NULL;
3387 PyObject *exc = NULL;
3388 /* the following variable is used for caching string comparisons
3389 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3390 * 3=ignore, 4=xmlcharrefreplace */
3391 int known_errorHandler = -1;
3392
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 if (mapping == NULL) {
3394 PyErr_BadArgument();
3395 return NULL;
3396 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003397
3398 /* allocate enough for a simple 1:1 translation without
3399 replacements, if we need more, we'll resize */
3400 res = PyUnicode_FromUnicode(NULL, size);
3401 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003402 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003404 return res;
3405 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407 while (p<endp) {
3408 /* try to encode it */
3409 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003410 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412 goto onError;
3413 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003414 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 if (x!=Py_None) /* it worked => adjust input pointer */
3416 ++p;
3417 else { /* untranslatable character */
3418 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3419 int repsize;
3420 int newpos;
3421 Py_UNICODE *uni2;
3422 /* startpos for collecting untranslatable chars */
3423 const Py_UNICODE *collstart = p;
3424 const Py_UNICODE *collend = p+1;
3425 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003427 /* find all untranslatable characters */
3428 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003429 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 goto onError;
3431 Py_XDECREF(x);
3432 if (x!=Py_None)
3433 break;
3434 ++collend;
3435 }
3436 /* cache callback name lookup
3437 * (if not done yet, i.e. it's the first error) */
3438 if (known_errorHandler==-1) {
3439 if ((errors==NULL) || (!strcmp(errors, "strict")))
3440 known_errorHandler = 1;
3441 else if (!strcmp(errors, "replace"))
3442 known_errorHandler = 2;
3443 else if (!strcmp(errors, "ignore"))
3444 known_errorHandler = 3;
3445 else if (!strcmp(errors, "xmlcharrefreplace"))
3446 known_errorHandler = 4;
3447 else
3448 known_errorHandler = 0;
3449 }
3450 switch (known_errorHandler) {
3451 case 1: /* strict */
3452 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3453 goto onError;
3454 case 2: /* replace */
3455 /* No need to check for space, this is a 1:1 replacement */
3456 for (coll = collstart; coll<collend; ++coll)
3457 *str++ = '?';
3458 /* fall through */
3459 case 3: /* ignore */
3460 p = collend;
3461 break;
3462 case 4: /* xmlcharrefreplace */
3463 /* generate replacement (temporarily (mis)uses p) */
3464 for (p = collstart; p < collend; ++p) {
3465 char buffer[2+29+1+1];
3466 char *cp;
3467 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003468 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003469 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3470 goto onError;
3471 for (cp = buffer; *cp; ++cp)
3472 *str++ = *cp;
3473 }
3474 p = collend;
3475 break;
3476 default:
3477 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3478 reason, startp, size, &exc,
3479 collstart-startp, collend-startp, &newpos);
3480 if (repunicode == NULL)
3481 goto onError;
3482 /* generate replacement */
3483 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003484 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003485 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3486 Py_DECREF(repunicode);
3487 goto onError;
3488 }
3489 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3490 *str++ = *uni2;
3491 p = startp + newpos;
3492 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 }
3494 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496 /* Resize if we allocated to much */
3497 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003498 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003499 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003500 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 }
3502 Py_XDECREF(exc);
3503 Py_XDECREF(errorHandler);
3504 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 onError:
3507 Py_XDECREF(res);
3508 Py_XDECREF(exc);
3509 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 return NULL;
3511}
3512
3513PyObject *PyUnicode_Translate(PyObject *str,
3514 PyObject *mapping,
3515 const char *errors)
3516{
3517 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003518
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 str = PyUnicode_FromObject(str);
3520 if (str == NULL)
3521 goto onError;
3522 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3523 PyUnicode_GET_SIZE(str),
3524 mapping,
3525 errors);
3526 Py_DECREF(str);
3527 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003528
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529 onError:
3530 Py_XDECREF(str);
3531 return NULL;
3532}
Tim Petersced69f82003-09-16 20:30:58 +00003533
Guido van Rossum9e896b32000-04-05 20:11:21 +00003534/* --- Decimal Encoder ---------------------------------------------------- */
3535
3536int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3537 int length,
3538 char *output,
3539 const char *errors)
3540{
3541 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 PyObject *errorHandler = NULL;
3543 PyObject *exc = NULL;
3544 const char *encoding = "decimal";
3545 const char *reason = "invalid decimal Unicode string";
3546 /* the following variable is used for caching string comparisons
3547 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3548 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003549
3550 if (output == NULL) {
3551 PyErr_BadArgument();
3552 return -1;
3553 }
3554
3555 p = s;
3556 end = s + length;
3557 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003559 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 PyObject *repunicode;
3561 int repsize;
3562 int newpos;
3563 Py_UNICODE *uni2;
3564 Py_UNICODE *collstart;
3565 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003566
Guido van Rossum9e896b32000-04-05 20:11:21 +00003567 if (Py_UNICODE_ISSPACE(ch)) {
3568 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003570 continue;
3571 }
3572 decimal = Py_UNICODE_TODECIMAL(ch);
3573 if (decimal >= 0) {
3574 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003576 continue;
3577 }
Guido van Rossumba477042000-04-06 18:18:10 +00003578 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003579 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003581 continue;
3582 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 /* All other characters are considered unencodable */
3584 collstart = p;
3585 collend = p+1;
3586 while (collend < end) {
3587 if ((0 < *collend && *collend < 256) ||
3588 !Py_UNICODE_ISSPACE(*collend) ||
3589 Py_UNICODE_TODECIMAL(*collend))
3590 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003591 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 /* cache callback name lookup
3593 * (if not done yet, i.e. it's the first error) */
3594 if (known_errorHandler==-1) {
3595 if ((errors==NULL) || (!strcmp(errors, "strict")))
3596 known_errorHandler = 1;
3597 else if (!strcmp(errors, "replace"))
3598 known_errorHandler = 2;
3599 else if (!strcmp(errors, "ignore"))
3600 known_errorHandler = 3;
3601 else if (!strcmp(errors, "xmlcharrefreplace"))
3602 known_errorHandler = 4;
3603 else
3604 known_errorHandler = 0;
3605 }
3606 switch (known_errorHandler) {
3607 case 1: /* strict */
3608 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3609 goto onError;
3610 case 2: /* replace */
3611 for (p = collstart; p < collend; ++p)
3612 *output++ = '?';
3613 /* fall through */
3614 case 3: /* ignore */
3615 p = collend;
3616 break;
3617 case 4: /* xmlcharrefreplace */
3618 /* generate replacement (temporarily (mis)uses p) */
3619 for (p = collstart; p < collend; ++p)
3620 output += sprintf(output, "&#%d;", (int)*p);
3621 p = collend;
3622 break;
3623 default:
3624 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3625 encoding, reason, s, length, &exc,
3626 collstart-s, collend-s, &newpos);
3627 if (repunicode == NULL)
3628 goto onError;
3629 /* generate replacement */
3630 repsize = PyUnicode_GET_SIZE(repunicode);
3631 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3632 Py_UNICODE ch = *uni2;
3633 if (Py_UNICODE_ISSPACE(ch))
3634 *output++ = ' ';
3635 else {
3636 decimal = Py_UNICODE_TODECIMAL(ch);
3637 if (decimal >= 0)
3638 *output++ = '0' + decimal;
3639 else if (0 < ch && ch < 256)
3640 *output++ = (char)ch;
3641 else {
3642 Py_DECREF(repunicode);
3643 raise_encode_exception(&exc, encoding,
3644 s, length, collstart-s, collend-s, reason);
3645 goto onError;
3646 }
3647 }
3648 }
3649 p = s + newpos;
3650 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003651 }
3652 }
3653 /* 0-terminate the output string */
3654 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 Py_XDECREF(exc);
3656 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003657 return 0;
3658
3659 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 Py_XDECREF(exc);
3661 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003662 return -1;
3663}
3664
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665/* --- Helpers ------------------------------------------------------------ */
3666
Tim Petersced69f82003-09-16 20:30:58 +00003667static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668int count(PyUnicodeObject *self,
3669 int start,
3670 int end,
3671 PyUnicodeObject *substring)
3672{
3673 int count = 0;
3674
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003675 if (start < 0)
3676 start += self->length;
3677 if (start < 0)
3678 start = 0;
3679 if (end > self->length)
3680 end = self->length;
3681 if (end < 0)
3682 end += self->length;
3683 if (end < 0)
3684 end = 0;
3685
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003686 if (substring->length == 0)
3687 return (end - start + 1);
3688
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 end -= substring->length;
3690
3691 while (start <= end)
3692 if (Py_UNICODE_MATCH(self, start, substring)) {
3693 count++;
3694 start += substring->length;
3695 } else
3696 start++;
3697
3698 return count;
3699}
3700
3701int PyUnicode_Count(PyObject *str,
3702 PyObject *substr,
3703 int start,
3704 int end)
3705{
3706 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003707
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 str = PyUnicode_FromObject(str);
3709 if (str == NULL)
3710 return -1;
3711 substr = PyUnicode_FromObject(substr);
3712 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003713 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 return -1;
3715 }
Tim Petersced69f82003-09-16 20:30:58 +00003716
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 result = count((PyUnicodeObject *)str,
3718 start, end,
3719 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003720
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721 Py_DECREF(str);
3722 Py_DECREF(substr);
3723 return result;
3724}
3725
Tim Petersced69f82003-09-16 20:30:58 +00003726static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727int findstring(PyUnicodeObject *self,
3728 PyUnicodeObject *substring,
3729 int start,
3730 int end,
3731 int direction)
3732{
3733 if (start < 0)
3734 start += self->length;
3735 if (start < 0)
3736 start = 0;
3737
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 if (end > self->length)
3739 end = self->length;
3740 if (end < 0)
3741 end += self->length;
3742 if (end < 0)
3743 end = 0;
3744
Guido van Rossum76afbd92002-08-20 17:29:29 +00003745 if (substring->length == 0)
3746 return (direction > 0) ? start : end;
3747
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748 end -= substring->length;
3749
3750 if (direction < 0) {
3751 for (; end >= start; end--)
3752 if (Py_UNICODE_MATCH(self, end, substring))
3753 return end;
3754 } else {
3755 for (; start <= end; start++)
3756 if (Py_UNICODE_MATCH(self, start, substring))
3757 return start;
3758 }
3759
3760 return -1;
3761}
3762
3763int PyUnicode_Find(PyObject *str,
3764 PyObject *substr,
3765 int start,
3766 int end,
3767 int direction)
3768{
3769 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003770
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 str = PyUnicode_FromObject(str);
3772 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003773 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774 substr = PyUnicode_FromObject(substr);
3775 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003776 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003777 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 }
Tim Petersced69f82003-09-16 20:30:58 +00003779
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 result = findstring((PyUnicodeObject *)str,
3781 (PyUnicodeObject *)substr,
3782 start, end, direction);
3783 Py_DECREF(str);
3784 Py_DECREF(substr);
3785 return result;
3786}
3787
Tim Petersced69f82003-09-16 20:30:58 +00003788static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789int tailmatch(PyUnicodeObject *self,
3790 PyUnicodeObject *substring,
3791 int start,
3792 int end,
3793 int direction)
3794{
3795 if (start < 0)
3796 start += self->length;
3797 if (start < 0)
3798 start = 0;
3799
3800 if (substring->length == 0)
3801 return 1;
3802
3803 if (end > self->length)
3804 end = self->length;
3805 if (end < 0)
3806 end += self->length;
3807 if (end < 0)
3808 end = 0;
3809
3810 end -= substring->length;
3811 if (end < start)
3812 return 0;
3813
3814 if (direction > 0) {
3815 if (Py_UNICODE_MATCH(self, end, substring))
3816 return 1;
3817 } else {
3818 if (Py_UNICODE_MATCH(self, start, substring))
3819 return 1;
3820 }
3821
3822 return 0;
3823}
3824
3825int PyUnicode_Tailmatch(PyObject *str,
3826 PyObject *substr,
3827 int start,
3828 int end,
3829 int direction)
3830{
3831 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003832
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833 str = PyUnicode_FromObject(str);
3834 if (str == NULL)
3835 return -1;
3836 substr = PyUnicode_FromObject(substr);
3837 if (substr == NULL) {
3838 Py_DECREF(substr);
3839 return -1;
3840 }
Tim Petersced69f82003-09-16 20:30:58 +00003841
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842 result = tailmatch((PyUnicodeObject *)str,
3843 (PyUnicodeObject *)substr,
3844 start, end, direction);
3845 Py_DECREF(str);
3846 Py_DECREF(substr);
3847 return result;
3848}
3849
Tim Petersced69f82003-09-16 20:30:58 +00003850static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851const Py_UNICODE *findchar(const Py_UNICODE *s,
3852 int size,
3853 Py_UNICODE ch)
3854{
3855 /* like wcschr, but doesn't stop at NULL characters */
3856
3857 while (size-- > 0) {
3858 if (*s == ch)
3859 return s;
3860 s++;
3861 }
3862
3863 return NULL;
3864}
3865
3866/* Apply fixfct filter to the Unicode object self and return a
3867 reference to the modified object */
3868
Tim Petersced69f82003-09-16 20:30:58 +00003869static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870PyObject *fixup(PyUnicodeObject *self,
3871 int (*fixfct)(PyUnicodeObject *s))
3872{
3873
3874 PyUnicodeObject *u;
3875
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003876 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877 if (u == NULL)
3878 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003879
3880 Py_UNICODE_COPY(u->str, self->str, self->length);
3881
Tim Peters7a29bd52001-09-12 03:03:31 +00003882 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883 /* fixfct should return TRUE if it modified the buffer. If
3884 FALSE, return a reference to the original buffer instead
3885 (to save space, not time) */
3886 Py_INCREF(self);
3887 Py_DECREF(u);
3888 return (PyObject*) self;
3889 }
3890 return (PyObject*) u;
3891}
3892
Tim Petersced69f82003-09-16 20:30:58 +00003893static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894int fixupper(PyUnicodeObject *self)
3895{
3896 int len = self->length;
3897 Py_UNICODE *s = self->str;
3898 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003899
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900 while (len-- > 0) {
3901 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003902
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903 ch = Py_UNICODE_TOUPPER(*s);
3904 if (ch != *s) {
3905 status = 1;
3906 *s = ch;
3907 }
3908 s++;
3909 }
3910
3911 return status;
3912}
3913
Tim Petersced69f82003-09-16 20:30:58 +00003914static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915int fixlower(PyUnicodeObject *self)
3916{
3917 int len = self->length;
3918 Py_UNICODE *s = self->str;
3919 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003920
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 while (len-- > 0) {
3922 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003923
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924 ch = Py_UNICODE_TOLOWER(*s);
3925 if (ch != *s) {
3926 status = 1;
3927 *s = ch;
3928 }
3929 s++;
3930 }
3931
3932 return status;
3933}
3934
Tim Petersced69f82003-09-16 20:30:58 +00003935static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936int fixswapcase(PyUnicodeObject *self)
3937{
3938 int len = self->length;
3939 Py_UNICODE *s = self->str;
3940 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003941
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942 while (len-- > 0) {
3943 if (Py_UNICODE_ISUPPER(*s)) {
3944 *s = Py_UNICODE_TOLOWER(*s);
3945 status = 1;
3946 } else if (Py_UNICODE_ISLOWER(*s)) {
3947 *s = Py_UNICODE_TOUPPER(*s);
3948 status = 1;
3949 }
3950 s++;
3951 }
3952
3953 return status;
3954}
3955
Tim Petersced69f82003-09-16 20:30:58 +00003956static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957int fixcapitalize(PyUnicodeObject *self)
3958{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003959 int len = self->length;
3960 Py_UNICODE *s = self->str;
3961 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003962
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003963 if (len == 0)
3964 return 0;
3965 if (Py_UNICODE_ISLOWER(*s)) {
3966 *s = Py_UNICODE_TOUPPER(*s);
3967 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003969 s++;
3970 while (--len > 0) {
3971 if (Py_UNICODE_ISUPPER(*s)) {
3972 *s = Py_UNICODE_TOLOWER(*s);
3973 status = 1;
3974 }
3975 s++;
3976 }
3977 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978}
3979
3980static
3981int fixtitle(PyUnicodeObject *self)
3982{
3983 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3984 register Py_UNICODE *e;
3985 int previous_is_cased;
3986
3987 /* Shortcut for single character strings */
3988 if (PyUnicode_GET_SIZE(self) == 1) {
3989 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3990 if (*p != ch) {
3991 *p = ch;
3992 return 1;
3993 }
3994 else
3995 return 0;
3996 }
Tim Petersced69f82003-09-16 20:30:58 +00003997
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 e = p + PyUnicode_GET_SIZE(self);
3999 previous_is_cased = 0;
4000 for (; p < e; p++) {
4001 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004002
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 if (previous_is_cased)
4004 *p = Py_UNICODE_TOLOWER(ch);
4005 else
4006 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004007
4008 if (Py_UNICODE_ISLOWER(ch) ||
4009 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 Py_UNICODE_ISTITLE(ch))
4011 previous_is_cased = 1;
4012 else
4013 previous_is_cased = 0;
4014 }
4015 return 1;
4016}
4017
Tim Peters8ce9f162004-08-27 01:49:32 +00004018PyObject *
4019PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020{
Tim Peters8ce9f162004-08-27 01:49:32 +00004021 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004022 const Py_UNICODE blank = ' ';
4023 const Py_UNICODE *sep = &blank;
4024 size_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004025 PyUnicodeObject *res = NULL; /* the result */
4026 size_t res_alloc = 100; /* # allocated bytes for string in res */
4027 size_t res_used; /* # used bytes */
4028 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4029 PyObject *fseq; /* PySequence_Fast(seq) */
4030 int seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004031 PyObject *item;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032 int i;
4033
Tim Peters05eba1f2004-08-27 21:32:02 +00004034 fseq = PySequence_Fast(seq, "");
4035 if (fseq == NULL) {
4036 if (PyErr_ExceptionMatches(PyExc_TypeError))
4037 PyErr_Format(PyExc_TypeError,
4038 "sequence expected, %.80s found",
4039 seq->ob_type->tp_name);
4040 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004041 }
4042
Tim Peters91879ab2004-08-27 22:35:44 +00004043 /* Grrrr. A codec may be invoked to convert str objects to
4044 * Unicode, and so it's possible to call back into Python code
4045 * during PyUnicode_FromObject(), and so it's possible for a sick
4046 * codec to change the size of fseq (if seq is a list). Therefore
4047 * we have to keep refetching the size -- can't assume seqlen
4048 * is invariant.
4049 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004050 seqlen = PySequence_Fast_GET_SIZE(fseq);
4051 /* If empty sequence, return u"". */
4052 if (seqlen == 0) {
4053 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4054 goto Done;
4055 }
4056 /* If singleton sequence with an exact Unicode, return that. */
4057 if (seqlen == 1) {
4058 item = PySequence_Fast_GET_ITEM(fseq, 0);
4059 if (PyUnicode_CheckExact(item)) {
4060 Py_INCREF(item);
4061 res = (PyUnicodeObject *)item;
4062 goto Done;
4063 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004064 }
4065
Tim Peters05eba1f2004-08-27 21:32:02 +00004066 /* At least two items to join, or one that isn't exact Unicode. */
4067 if (seqlen > 1) {
4068 /* Set up sep and seplen -- they're needed. */
4069 if (separator == NULL) {
4070 sep = &blank;
4071 seplen = 1;
4072 }
4073 else {
4074 internal_separator = PyUnicode_FromObject(separator);
4075 if (internal_separator == NULL)
4076 goto onError;
4077 sep = PyUnicode_AS_UNICODE(internal_separator);
4078 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004079 /* In case PyUnicode_FromObject() mutated seq. */
4080 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004081 }
4082 }
4083
4084 /* Get space. */
4085 res = _PyUnicode_New((int)res_alloc);
4086 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004087 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004088 res_p = PyUnicode_AS_UNICODE(res);
4089 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004090
Tim Peters05eba1f2004-08-27 21:32:02 +00004091 for (i = 0; i < seqlen; ++i) {
4092 size_t itemlen;
4093 size_t new_res_used;
4094
4095 item = PySequence_Fast_GET_ITEM(fseq, i);
4096 /* Convert item to Unicode. */
4097 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4098 PyErr_Format(PyExc_TypeError,
4099 "sequence item %i: expected string or Unicode,"
4100 " %.80s found",
4101 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004102 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004103 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004104 item = PyUnicode_FromObject(item);
4105 if (item == NULL)
4106 goto onError;
4107 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004108
Tim Peters91879ab2004-08-27 22:35:44 +00004109 /* In case PyUnicode_FromObject() mutated seq. */
4110 seqlen = PySequence_Fast_GET_SIZE(fseq);
4111
Tim Peters8ce9f162004-08-27 01:49:32 +00004112 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004114 new_res_used = res_used + itemlen;
4115 if (new_res_used < res_used || new_res_used > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004116 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004117 if (i < seqlen - 1) {
4118 new_res_used += seplen;
4119 if (new_res_used < res_used || new_res_used > INT_MAX)
4120 goto Overflow;
4121 }
4122 if (new_res_used > res_alloc) {
4123 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004124 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004125 size_t oldsize = res_alloc;
4126 res_alloc += res_alloc;
4127 if (res_alloc < oldsize || res_alloc > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004128 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004129 } while (new_res_used > res_alloc);
4130 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004131 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004133 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004134 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004136
4137 /* Copy item, and maybe the separator. */
4138 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4139 res_p += itemlen;
4140 if (i < seqlen - 1) {
4141 Py_UNICODE_COPY(res_p, sep, (int)seplen);
4142 res_p += seplen;
4143 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004145 res_used = new_res_used;
4146 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004147
Tim Peters05eba1f2004-08-27 21:32:02 +00004148 /* Shrink res to match the used area; this probably can't fail,
4149 * but it's cheap to check.
4150 */
4151 if (_PyUnicode_Resize(&res, (int)res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004152 goto onError;
4153
4154 Done:
4155 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004156 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157 return (PyObject *)res;
4158
Tim Peters8ce9f162004-08-27 01:49:32 +00004159 Overflow:
4160 PyErr_SetString(PyExc_OverflowError,
4161 "join() is too long for a Python string");
4162 Py_DECREF(item);
4163 /* fall through */
4164
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004166 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004167 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004168 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 return NULL;
4170}
4171
Tim Petersced69f82003-09-16 20:30:58 +00004172static
4173PyUnicodeObject *pad(PyUnicodeObject *self,
4174 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175 int right,
4176 Py_UNICODE fill)
4177{
4178 PyUnicodeObject *u;
4179
4180 if (left < 0)
4181 left = 0;
4182 if (right < 0)
4183 right = 0;
4184
Tim Peters7a29bd52001-09-12 03:03:31 +00004185 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186 Py_INCREF(self);
4187 return self;
4188 }
4189
4190 u = _PyUnicode_New(left + self->length + right);
4191 if (u) {
4192 if (left)
4193 Py_UNICODE_FILL(u->str, fill, left);
4194 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4195 if (right)
4196 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4197 }
4198
4199 return u;
4200}
4201
/* Append the slice data[left:right] to `list` as a new Unicode object.
 *
 * Relies on names from the expanding function: a `PyObject *str` scratch
 * variable, the `list` being built, and an `onError` label that is jumped
 * to when either the slice creation or the append fails.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * cannot capture a following `else`.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4212
/* Insert the slice data[left:right] at the front of `list` as a new
 * Unicode object.
 *
 * Relies on names from the expanding function: a `PyObject *str` scratch
 * variable, the `list` being built, and an `onError` label that is jumped
 * to when either the slice creation or the insert fails.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * cannot capture a following `else`.
 */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4223
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224static
4225PyObject *split_whitespace(PyUnicodeObject *self,
4226 PyObject *list,
4227 int maxcount)
4228{
4229 register int i;
4230 register int j;
4231 int len = self->length;
4232 PyObject *str;
4233
4234 for (i = j = 0; i < len; ) {
4235 /* find a token */
4236 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4237 i++;
4238 j = i;
4239 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4240 i++;
4241 if (j < i) {
4242 if (maxcount-- <= 0)
4243 break;
4244 SPLIT_APPEND(self->str, j, i);
4245 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4246 i++;
4247 j = i;
4248 }
4249 }
4250 if (j < len) {
4251 SPLIT_APPEND(self->str, j, len);
4252 }
4253 return list;
4254
4255 onError:
4256 Py_DECREF(list);
4257 return NULL;
4258}
4259
4260PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004261 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262{
4263 register int i;
4264 register int j;
4265 int len;
4266 PyObject *list;
4267 PyObject *str;
4268 Py_UNICODE *data;
4269
4270 string = PyUnicode_FromObject(string);
4271 if (string == NULL)
4272 return NULL;
4273 data = PyUnicode_AS_UNICODE(string);
4274 len = PyUnicode_GET_SIZE(string);
4275
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 list = PyList_New(0);
4277 if (!list)
4278 goto onError;
4279
4280 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004281 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004282
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 /* Find a line and append it */
4284 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4285 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286
4287 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004288 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289 if (i < len) {
4290 if (data[i] == '\r' && i + 1 < len &&
4291 data[i+1] == '\n')
4292 i += 2;
4293 else
4294 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004295 if (keepends)
4296 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 }
Guido van Rossum86662912000-04-11 15:38:46 +00004298 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 j = i;
4300 }
4301 if (j < len) {
4302 SPLIT_APPEND(data, j, len);
4303 }
4304
4305 Py_DECREF(string);
4306 return list;
4307
4308 onError:
4309 Py_DECREF(list);
4310 Py_DECREF(string);
4311 return NULL;
4312}
4313
Tim Petersced69f82003-09-16 20:30:58 +00004314static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315PyObject *split_char(PyUnicodeObject *self,
4316 PyObject *list,
4317 Py_UNICODE ch,
4318 int maxcount)
4319{
4320 register int i;
4321 register int j;
4322 int len = self->length;
4323 PyObject *str;
4324
4325 for (i = j = 0; i < len; ) {
4326 if (self->str[i] == ch) {
4327 if (maxcount-- <= 0)
4328 break;
4329 SPLIT_APPEND(self->str, j, i);
4330 i = j = i + 1;
4331 } else
4332 i++;
4333 }
4334 if (j <= len) {
4335 SPLIT_APPEND(self->str, j, len);
4336 }
4337 return list;
4338
4339 onError:
4340 Py_DECREF(list);
4341 return NULL;
4342}
4343
Tim Petersced69f82003-09-16 20:30:58 +00004344static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345PyObject *split_substring(PyUnicodeObject *self,
4346 PyObject *list,
4347 PyUnicodeObject *substring,
4348 int maxcount)
4349{
4350 register int i;
4351 register int j;
4352 int len = self->length;
4353 int sublen = substring->length;
4354 PyObject *str;
4355
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004356 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357 if (Py_UNICODE_MATCH(self, i, substring)) {
4358 if (maxcount-- <= 0)
4359 break;
4360 SPLIT_APPEND(self->str, j, i);
4361 i = j = i + sublen;
4362 } else
4363 i++;
4364 }
4365 if (j <= len) {
4366 SPLIT_APPEND(self->str, j, len);
4367 }
4368 return list;
4369
4370 onError:
4371 Py_DECREF(list);
4372 return NULL;
4373}
4374
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004375static
4376PyObject *rsplit_whitespace(PyUnicodeObject *self,
4377 PyObject *list,
4378 int maxcount)
4379{
4380 register int i;
4381 register int j;
4382 int len = self->length;
4383 PyObject *str;
4384
4385 for (i = j = len - 1; i >= 0; ) {
4386 /* find a token */
4387 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4388 i--;
4389 j = i;
4390 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4391 i--;
4392 if (j > i) {
4393 if (maxcount-- <= 0)
4394 break;
4395 SPLIT_INSERT(self->str, i + 1, j + 1);
4396 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4397 i--;
4398 j = i;
4399 }
4400 }
4401 if (j >= 0) {
4402 SPLIT_INSERT(self->str, 0, j + 1);
4403 }
4404 return list;
4405
4406 onError:
4407 Py_DECREF(list);
4408 return NULL;
4409}
4410
4411static
4412PyObject *rsplit_char(PyUnicodeObject *self,
4413 PyObject *list,
4414 Py_UNICODE ch,
4415 int maxcount)
4416{
4417 register int i;
4418 register int j;
4419 int len = self->length;
4420 PyObject *str;
4421
4422 for (i = j = len - 1; i >= 0; ) {
4423 if (self->str[i] == ch) {
4424 if (maxcount-- <= 0)
4425 break;
4426 SPLIT_INSERT(self->str, i + 1, j + 1);
4427 j = i = i - 1;
4428 } else
4429 i--;
4430 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004431 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004432 SPLIT_INSERT(self->str, 0, j + 1);
4433 }
4434 return list;
4435
4436 onError:
4437 Py_DECREF(list);
4438 return NULL;
4439}
4440
4441static
4442PyObject *rsplit_substring(PyUnicodeObject *self,
4443 PyObject *list,
4444 PyUnicodeObject *substring,
4445 int maxcount)
4446{
4447 register int i;
4448 register int j;
4449 int len = self->length;
4450 int sublen = substring->length;
4451 PyObject *str;
4452
4453 for (i = len - sublen, j = len; i >= 0; ) {
4454 if (Py_UNICODE_MATCH(self, i, substring)) {
4455 if (maxcount-- <= 0)
4456 break;
4457 SPLIT_INSERT(self->str, i + sublen, j);
4458 j = i;
4459 i -= sublen;
4460 } else
4461 i--;
4462 }
4463 if (j >= 0) {
4464 SPLIT_INSERT(self->str, 0, j);
4465 }
4466 return list;
4467
4468 onError:
4469 Py_DECREF(list);
4470 return NULL;
4471}
4472
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004474#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475
4476static
4477PyObject *split(PyUnicodeObject *self,
4478 PyUnicodeObject *substring,
4479 int maxcount)
4480{
4481 PyObject *list;
4482
4483 if (maxcount < 0)
4484 maxcount = INT_MAX;
4485
4486 list = PyList_New(0);
4487 if (!list)
4488 return NULL;
4489
4490 if (substring == NULL)
4491 return split_whitespace(self,list,maxcount);
4492
4493 else if (substring->length == 1)
4494 return split_char(self,list,substring->str[0],maxcount);
4495
4496 else if (substring->length == 0) {
4497 Py_DECREF(list);
4498 PyErr_SetString(PyExc_ValueError, "empty separator");
4499 return NULL;
4500 }
4501 else
4502 return split_substring(self,list,substring,maxcount);
4503}
4504
Tim Petersced69f82003-09-16 20:30:58 +00004505static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004506PyObject *rsplit(PyUnicodeObject *self,
4507 PyUnicodeObject *substring,
4508 int maxcount)
4509{
4510 PyObject *list;
4511
4512 if (maxcount < 0)
4513 maxcount = INT_MAX;
4514
4515 list = PyList_New(0);
4516 if (!list)
4517 return NULL;
4518
4519 if (substring == NULL)
4520 return rsplit_whitespace(self,list,maxcount);
4521
4522 else if (substring->length == 1)
4523 return rsplit_char(self,list,substring->str[0],maxcount);
4524
4525 else if (substring->length == 0) {
4526 Py_DECREF(list);
4527 PyErr_SetString(PyExc_ValueError, "empty separator");
4528 return NULL;
4529 }
4530 else
4531 return rsplit_substring(self,list,substring,maxcount);
4532}
4533
4534static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535PyObject *replace(PyUnicodeObject *self,
4536 PyUnicodeObject *str1,
4537 PyUnicodeObject *str2,
4538 int maxcount)
4539{
4540 PyUnicodeObject *u;
4541
4542 if (maxcount < 0)
4543 maxcount = INT_MAX;
4544
4545 if (str1->length == 1 && str2->length == 1) {
4546 int i;
4547
4548 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004549 if (!findchar(self->str, self->length, str1->str[0]) &&
4550 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551 /* nothing to replace, return original string */
4552 Py_INCREF(self);
4553 u = self;
4554 } else {
4555 Py_UNICODE u1 = str1->str[0];
4556 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004557
Guido van Rossumd57fd912000-03-10 22:53:23 +00004558 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004559 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560 self->length
4561 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004562 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004563 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004564 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565 for (i = 0; i < u->length; i++)
4566 if (u->str[i] == u1) {
4567 if (--maxcount < 0)
4568 break;
4569 u->str[i] = u2;
4570 }
4571 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573
4574 } else {
4575 int n, i;
4576 Py_UNICODE *p;
4577
4578 /* replace strings */
4579 n = count(self, 0, self->length, str1);
4580 if (n > maxcount)
4581 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004582 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004584 if (PyUnicode_CheckExact(self)) {
4585 Py_INCREF(self);
4586 u = self;
4587 }
4588 else {
4589 u = (PyUnicodeObject *)
4590 PyUnicode_FromUnicode(self->str, self->length);
4591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592 } else {
4593 u = _PyUnicode_New(
4594 self->length + n * (str2->length - str1->length));
4595 if (u) {
4596 i = 0;
4597 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004598 if (str1->length > 0) {
4599 while (i <= self->length - str1->length)
4600 if (Py_UNICODE_MATCH(self, i, str1)) {
4601 /* replace string segment */
4602 Py_UNICODE_COPY(p, str2->str, str2->length);
4603 p += str2->length;
4604 i += str1->length;
4605 if (--n <= 0) {
4606 /* copy remaining part */
4607 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4608 break;
4609 }
4610 } else
4611 *p++ = self->str[i++];
4612 } else {
4613 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614 Py_UNICODE_COPY(p, str2->str, str2->length);
4615 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004616 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004619 }
4620 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622 }
4623 }
4624 }
Tim Petersced69f82003-09-16 20:30:58 +00004625
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626 return (PyObject *) u;
4627}
4628
4629/* --- Unicode Object Methods --------------------------------------------- */
4630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004631PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632"S.title() -> unicode\n\
4633\n\
4634Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004635characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636
4637static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004638unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004639{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640 return fixup(self, fixtitle);
4641}
4642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004643PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004644"S.capitalize() -> unicode\n\
4645\n\
4646Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004647have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648
4649static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004650unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652 return fixup(self, fixcapitalize);
4653}
4654
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
 *
 * Splits `self` on whitespace, replaces every word in the resulting list
 * with its fixcapitalize'd version, and joins the words back together via
 * PyUnicode_Join() with a NULL separator.  Returns NULL on failure.
 */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
4692
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004693/* Argument converter. Coerces to a single unicode character */
4694
4695static int
4696convert_uc(PyObject *obj, void *addr)
4697{
4698 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4699 PyObject *uniobj;
4700 Py_UNICODE *unistr;
4701
4702 uniobj = PyUnicode_FromObject(obj);
4703 if (uniobj == NULL) {
4704 PyErr_SetString(PyExc_TypeError,
4705 "The fill character cannot be converted to Unicode");
4706 return 0;
4707 }
4708 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4709 PyErr_SetString(PyExc_TypeError,
4710 "The fill character must be exactly one character long");
4711 Py_DECREF(uniobj);
4712 return 0;
4713 }
4714 unistr = PyUnicode_AS_UNICODE(uniobj);
4715 *fillcharloc = unistr[0];
4716 Py_DECREF(uniobj);
4717 return 1;
4718}
4719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004720PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004721"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004723Return S centered in a Unicode string of length width. Padding is\n\
4724done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725
4726static PyObject *
4727unicode_center(PyUnicodeObject *self, PyObject *args)
4728{
4729 int marg, left;
4730 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004731 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004733 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 return NULL;
4735
Tim Peters7a29bd52001-09-12 03:03:31 +00004736 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737 Py_INCREF(self);
4738 return (PyObject*) self;
4739 }
4740
4741 marg = width - self->length;
4742 left = marg / 2 + (marg & width & 1);
4743
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004744 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745}
4746
Marc-André Lemburge5034372000-08-08 08:04:29 +00004747#if 0
4748
4749/* This code should go into some future Unicode collation support
4750 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004751 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004752
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004753/* speedy UTF-16 code point order comparison */
4754/* gleaned from: */
4755/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4756
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004757static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004758{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004759 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004760 0, 0, 0, 0, 0, 0, 0, 0,
4761 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004762 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004763};
4764
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765static int
4766unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4767{
4768 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004769
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 Py_UNICODE *s1 = str1->str;
4771 Py_UNICODE *s2 = str2->str;
4772
4773 len1 = str1->length;
4774 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004775
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004777 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004778
4779 c1 = *s1++;
4780 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004781
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004782 if (c1 > (1<<11) * 26)
4783 c1 += utf16Fixup[c1>>11];
4784 if (c2 > (1<<11) * 26)
4785 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004786 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004787
4788 if (c1 != c2)
4789 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004790
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004791 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 }
4793
4794 return (len1 < len2) ? -1 : (len1 != len2);
4795}
4796
Marc-André Lemburge5034372000-08-08 08:04:29 +00004797#else
4798
4799static int
4800unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4801{
4802 register int len1, len2;
4803
4804 Py_UNICODE *s1 = str1->str;
4805 Py_UNICODE *s2 = str2->str;
4806
4807 len1 = str1->length;
4808 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004809
Marc-André Lemburge5034372000-08-08 08:04:29 +00004810 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004811 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004812
Fredrik Lundh45714e92001-06-26 16:39:36 +00004813 c1 = *s1++;
4814 c2 = *s2++;
4815
4816 if (c1 != c2)
4817 return (c1 < c2) ? -1 : 1;
4818
Marc-André Lemburge5034372000-08-08 08:04:29 +00004819 len1--; len2--;
4820 }
4821
4822 return (len1 < len2) ? -1 : (len1 != len2);
4823}
4824
4825#endif
4826
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827int PyUnicode_Compare(PyObject *left,
4828 PyObject *right)
4829{
4830 PyUnicodeObject *u = NULL, *v = NULL;
4831 int result;
4832
4833 /* Coerce the two arguments */
4834 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4835 if (u == NULL)
4836 goto onError;
4837 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4838 if (v == NULL)
4839 goto onError;
4840
Thomas Wouters7e474022000-07-16 12:04:32 +00004841 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 if (v == u) {
4843 Py_DECREF(u);
4844 Py_DECREF(v);
4845 return 0;
4846 }
4847
4848 result = unicode_compare(u, v);
4849
4850 Py_DECREF(u);
4851 Py_DECREF(v);
4852 return result;
4853
4854onError:
4855 Py_XDECREF(u);
4856 Py_XDECREF(v);
4857 return -1;
4858}
4859
Guido van Rossum403d68b2000-03-13 15:55:09 +00004860int PyUnicode_Contains(PyObject *container,
4861 PyObject *element)
4862{
4863 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004864 int result, size;
4865 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004866
4867 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004868 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004869 if (v == NULL) {
4870 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004871 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004872 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004873 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004874 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004875 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004876 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004877
Barry Warsaw817918c2002-08-06 16:58:21 +00004878 size = PyUnicode_GET_SIZE(v);
4879 rhs = PyUnicode_AS_UNICODE(v);
4880 lhs = PyUnicode_AS_UNICODE(u);
4881
Guido van Rossum403d68b2000-03-13 15:55:09 +00004882 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004883 if (size == 1) {
4884 end = lhs + PyUnicode_GET_SIZE(u);
4885 while (lhs < end) {
4886 if (*lhs++ == *rhs) {
4887 result = 1;
4888 break;
4889 }
4890 }
4891 }
4892 else {
4893 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4894 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004895 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004896 result = 1;
4897 break;
4898 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004899 }
4900 }
4901
4902 Py_DECREF(u);
4903 Py_DECREF(v);
4904 return result;
4905
4906onError:
4907 Py_XDECREF(u);
4908 Py_XDECREF(v);
4909 return -1;
4910}
4911
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912/* Concat to string or Unicode object giving a new Unicode object. */
4913
4914PyObject *PyUnicode_Concat(PyObject *left,
4915 PyObject *right)
4916{
4917 PyUnicodeObject *u = NULL, *v = NULL, *w;
4918
4919 /* Coerce the two arguments */
4920 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4921 if (u == NULL)
4922 goto onError;
4923 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4924 if (v == NULL)
4925 goto onError;
4926
4927 /* Shortcuts */
4928 if (v == unicode_empty) {
4929 Py_DECREF(v);
4930 return (PyObject *)u;
4931 }
4932 if (u == unicode_empty) {
4933 Py_DECREF(u);
4934 return (PyObject *)v;
4935 }
4936
4937 /* Concat the two Unicode strings */
4938 w = _PyUnicode_New(u->length + v->length);
4939 if (w == NULL)
4940 goto onError;
4941 Py_UNICODE_COPY(w->str, u->str, u->length);
4942 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4943
4944 Py_DECREF(u);
4945 Py_DECREF(v);
4946 return (PyObject *)w;
4947
4948onError:
4949 Py_XDECREF(u);
4950 Py_XDECREF(v);
4951 return NULL;
4952}
4953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004954PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955"S.count(sub[, start[, end]]) -> int\n\
4956\n\
4957Return the number of occurrences of substring sub in Unicode string\n\
4958S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004959interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960
4961static PyObject *
4962unicode_count(PyUnicodeObject *self, PyObject *args)
4963{
4964 PyUnicodeObject *substring;
4965 int start = 0;
4966 int end = INT_MAX;
4967 PyObject *result;
4968
Guido van Rossumb8872e62000-05-09 14:14:27 +00004969 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4970 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971 return NULL;
4972
4973 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4974 (PyObject *)substring);
4975 if (substring == NULL)
4976 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004977
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978 if (start < 0)
4979 start += self->length;
4980 if (start < 0)
4981 start = 0;
4982 if (end > self->length)
4983 end = self->length;
4984 if (end < 0)
4985 end += self->length;
4986 if (end < 0)
4987 end = 0;
4988
4989 result = PyInt_FromLong((long) count(self, start, end, substring));
4990
4991 Py_DECREF(substring);
4992 return result;
4993}
4994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004995PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004996"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004998Encodes S using the codec registered for encoding. encoding defaults\n\
4999to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005000handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005001a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5002'xmlcharrefreplace' as well as any other name registered with\n\
5003codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004
5005static PyObject *
5006unicode_encode(PyUnicodeObject *self, PyObject *args)
5007{
5008 char *encoding = NULL;
5009 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005010 PyObject *v;
5011
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5013 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005014 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005015 if (v == NULL)
5016 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005017 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5018 PyErr_Format(PyExc_TypeError,
5019 "encoder did not return a string/unicode object "
5020 "(type=%.400s)",
5021 v->ob_type->tp_name);
5022 Py_DECREF(v);
5023 return NULL;
5024 }
5025 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005026
5027 onError:
5028 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005029}
5030
5031PyDoc_STRVAR(decode__doc__,
5032"S.decode([encoding[,errors]]) -> string or unicode\n\
5033\n\
5034Decodes S using the codec registered for encoding. encoding defaults\n\
5035to the default encoding. errors may be given to set a different error\n\
5036handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5037a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5038as well as any other name registerd with codecs.register_error that is\n\
5039able to handle UnicodeDecodeErrors.");
5040
5041static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005042unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005043{
5044 char *encoding = NULL;
5045 char *errors = NULL;
5046 PyObject *v;
5047
5048 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5049 return NULL;
5050 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005051 if (v == NULL)
5052 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005053 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5054 PyErr_Format(PyExc_TypeError,
5055 "decoder did not return a string/unicode object "
5056 "(type=%.400s)",
5057 v->ob_type->tp_name);
5058 Py_DECREF(v);
5059 return NULL;
5060 }
5061 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005062
5063 onError:
5064 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065}
5066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005067PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068"S.expandtabs([tabsize]) -> unicode\n\
5069\n\
5070Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005071If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072
5073static PyObject*
5074unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5075{
5076 Py_UNICODE *e;
5077 Py_UNICODE *p;
5078 Py_UNICODE *q;
5079 int i, j;
5080 PyUnicodeObject *u;
5081 int tabsize = 8;
5082
5083 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5084 return NULL;
5085
Thomas Wouters7e474022000-07-16 12:04:32 +00005086 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087 i = j = 0;
5088 e = self->str + self->length;
5089 for (p = self->str; p < e; p++)
5090 if (*p == '\t') {
5091 if (tabsize > 0)
5092 j += tabsize - (j % tabsize);
5093 }
5094 else {
5095 j++;
5096 if (*p == '\n' || *p == '\r') {
5097 i += j;
5098 j = 0;
5099 }
5100 }
5101
5102 /* Second pass: create output string and fill it */
5103 u = _PyUnicode_New(i + j);
5104 if (!u)
5105 return NULL;
5106
5107 j = 0;
5108 q = u->str;
5109
5110 for (p = self->str; p < e; p++)
5111 if (*p == '\t') {
5112 if (tabsize > 0) {
5113 i = tabsize - (j % tabsize);
5114 j += i;
5115 while (i--)
5116 *q++ = ' ';
5117 }
5118 }
5119 else {
5120 j++;
5121 *q++ = *p;
5122 if (*p == '\n' || *p == '\r')
5123 j = 0;
5124 }
5125
5126 return (PyObject*) u;
5127}
5128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005129PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130"S.find(sub [,start [,end]]) -> int\n\
5131\n\
5132Return the lowest index in S where substring sub is found,\n\
5133such that sub is contained within s[start,end]. Optional\n\
5134arguments start and end are interpreted as in slice notation.\n\
5135\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005136Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137
5138static PyObject *
5139unicode_find(PyUnicodeObject *self, PyObject *args)
5140{
5141 PyUnicodeObject *substring;
5142 int start = 0;
5143 int end = INT_MAX;
5144 PyObject *result;
5145
Guido van Rossumb8872e62000-05-09 14:14:27 +00005146 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5147 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 return NULL;
5149 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5150 (PyObject *)substring);
5151 if (substring == NULL)
5152 return NULL;
5153
5154 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5155
5156 Py_DECREF(substring);
5157 return result;
5158}
5159
5160static PyObject *
5161unicode_getitem(PyUnicodeObject *self, int index)
5162{
5163 if (index < 0 || index >= self->length) {
5164 PyErr_SetString(PyExc_IndexError, "string index out of range");
5165 return NULL;
5166 }
5167
5168 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5169}
5170
5171static long
5172unicode_hash(PyUnicodeObject *self)
5173{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005174 /* Since Unicode objects compare equal to their ASCII string
5175 counterparts, they should use the individual character values
5176 as basis for their hash value. This is needed to assure that
5177 strings and Unicode objects behave in the same way as
5178 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179
Fredrik Lundhdde61642000-07-10 18:27:47 +00005180 register int len;
5181 register Py_UNICODE *p;
5182 register long x;
5183
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 if (self->hash != -1)
5185 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005186 len = PyUnicode_GET_SIZE(self);
5187 p = PyUnicode_AS_UNICODE(self);
5188 x = *p << 7;
5189 while (--len >= 0)
5190 x = (1000003*x) ^ *p++;
5191 x ^= PyUnicode_GET_SIZE(self);
5192 if (x == -1)
5193 x = -2;
5194 self->hash = x;
5195 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196}
5197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005198PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199"S.index(sub [,start [,end]]) -> int\n\
5200\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005201Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202
5203static PyObject *
5204unicode_index(PyUnicodeObject *self, PyObject *args)
5205{
5206 int result;
5207 PyUnicodeObject *substring;
5208 int start = 0;
5209 int end = INT_MAX;
5210
Guido van Rossumb8872e62000-05-09 14:14:27 +00005211 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5212 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005214
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5216 (PyObject *)substring);
5217 if (substring == NULL)
5218 return NULL;
5219
5220 result = findstring(self, substring, start, end, 1);
5221
5222 Py_DECREF(substring);
5223 if (result < 0) {
5224 PyErr_SetString(PyExc_ValueError, "substring not found");
5225 return NULL;
5226 }
5227 return PyInt_FromLong(result);
5228}
5229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005230PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005231"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005233Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005234at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235
5236static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005237unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238{
5239 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5240 register const Py_UNICODE *e;
5241 int cased;
5242
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 /* Shortcut for single character strings */
5244 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005245 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005247 /* Special case for empty strings */
5248 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005249 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005250
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251 e = p + PyUnicode_GET_SIZE(self);
5252 cased = 0;
5253 for (; p < e; p++) {
5254 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005255
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005257 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 else if (!cased && Py_UNICODE_ISLOWER(ch))
5259 cased = 1;
5260 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005261 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262}
5263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005264PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005265"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005267Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005268at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269
5270static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005271unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272{
5273 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5274 register const Py_UNICODE *e;
5275 int cased;
5276
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 /* Shortcut for single character strings */
5278 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005279 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005281 /* Special case for empty strings */
5282 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005283 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005284
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 e = p + PyUnicode_GET_SIZE(self);
5286 cased = 0;
5287 for (; p < e; p++) {
5288 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005289
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005291 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 else if (!cased && Py_UNICODE_ISUPPER(ch))
5293 cased = 1;
5294 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005295 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296}
5297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005298PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005299"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005301Return True if S is a titlecased string and there is at least one\n\
5302character in S, i.e. upper- and titlecase characters may only\n\
5303follow uncased characters and lowercase characters only cased ones.\n\
5304Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305
5306static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005307unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308{
5309 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5310 register const Py_UNICODE *e;
5311 int cased, previous_is_cased;
5312
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 /* Shortcut for single character strings */
5314 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005315 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5316 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005318 /* Special case for empty strings */
5319 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005320 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005321
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322 e = p + PyUnicode_GET_SIZE(self);
5323 cased = 0;
5324 previous_is_cased = 0;
5325 for (; p < e; p++) {
5326 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005327
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5329 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005330 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 previous_is_cased = 1;
5332 cased = 1;
5333 }
5334 else if (Py_UNICODE_ISLOWER(ch)) {
5335 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005336 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 previous_is_cased = 1;
5338 cased = 1;
5339 }
5340 else
5341 previous_is_cased = 0;
5342 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005343 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344}
5345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005346PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005347"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005349Return True if all characters in S are whitespace\n\
5350and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351
5352static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005353unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354{
5355 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5356 register const Py_UNICODE *e;
5357
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 /* Shortcut for single character strings */
5359 if (PyUnicode_GET_SIZE(self) == 1 &&
5360 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005361 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005363 /* Special case for empty strings */
5364 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005365 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005366
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367 e = p + PyUnicode_GET_SIZE(self);
5368 for (; p < e; p++) {
5369 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005370 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005372 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373}
5374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005375PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005376"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005377\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005378Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005379and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005380
5381static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005382unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005383{
5384 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5385 register const Py_UNICODE *e;
5386
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005387 /* Shortcut for single character strings */
5388 if (PyUnicode_GET_SIZE(self) == 1 &&
5389 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005390 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005391
5392 /* Special case for empty strings */
5393 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005394 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005395
5396 e = p + PyUnicode_GET_SIZE(self);
5397 for (; p < e; p++) {
5398 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005399 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005400 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005401 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005402}
5403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005404PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005405"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005406\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005407Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005408and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005409
5410static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005411unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005412{
5413 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5414 register const Py_UNICODE *e;
5415
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005416 /* Shortcut for single character strings */
5417 if (PyUnicode_GET_SIZE(self) == 1 &&
5418 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005419 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005420
5421 /* Special case for empty strings */
5422 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005423 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005424
5425 e = p + PyUnicode_GET_SIZE(self);
5426 for (; p < e; p++) {
5427 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005428 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005429 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005430 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005431}
5432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005433PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005434"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005436Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005437False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438
5439static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005440unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441{
5442 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5443 register const Py_UNICODE *e;
5444
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 /* Shortcut for single character strings */
5446 if (PyUnicode_GET_SIZE(self) == 1 &&
5447 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005448 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005450 /* Special case for empty strings */
5451 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005452 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005453
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 e = p + PyUnicode_GET_SIZE(self);
5455 for (; p < e; p++) {
5456 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005457 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005459 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460}
5461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005462PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005463"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005465Return True if all characters in S are digits\n\
5466and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467
5468static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005469unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470{
5471 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5472 register const Py_UNICODE *e;
5473
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 /* Shortcut for single character strings */
5475 if (PyUnicode_GET_SIZE(self) == 1 &&
5476 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005477 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005479 /* Special case for empty strings */
5480 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005481 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005482
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 e = p + PyUnicode_GET_SIZE(self);
5484 for (; p < e; p++) {
5485 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005486 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005488 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489}
5490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005491PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005492"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005494Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005495False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496
5497static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005498unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499{
5500 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5501 register const Py_UNICODE *e;
5502
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 /* Shortcut for single character strings */
5504 if (PyUnicode_GET_SIZE(self) == 1 &&
5505 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005506 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005508 /* Special case for empty strings */
5509 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005510 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005511
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 e = p + PyUnicode_GET_SIZE(self);
5513 for (; p < e; p++) {
5514 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005515 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005517 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518}
5519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005520PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521"S.join(sequence) -> unicode\n\
5522\n\
5523Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005524sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525
5526static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005527unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005529 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530}
5531
5532static int
5533unicode_length(PyUnicodeObject *self)
5534{
5535 return self->length;
5536}
5537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005538PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005539"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540\n\
5541Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005542done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543
5544static PyObject *
5545unicode_ljust(PyUnicodeObject *self, PyObject *args)
5546{
5547 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005548 Py_UNICODE fillchar = ' ';
5549
5550 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 return NULL;
5552
Tim Peters7a29bd52001-09-12 03:03:31 +00005553 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 Py_INCREF(self);
5555 return (PyObject*) self;
5556 }
5557
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005558 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559}
5560
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005561PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562"S.lower() -> unicode\n\
5563\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005564Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565
5566static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005567unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 return fixup(self, fixlower);
5570}
5571
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its "|O:"
   prefix (3 characters). */
#define STRIPNAME(i) (stripformat[i]+3)
5580
5581static const Py_UNICODE *
5582unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5583{
Tim Peters030a5ce2002-04-22 19:00:10 +00005584 size_t i;
5585 for (i = 0; i < n; ++i)
5586 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005587 return s+i;
5588 return NULL;
5589}
5590
5591/* externally visible for str.strip(unicode) */
5592PyObject *
5593_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5594{
5595 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5596 int len = PyUnicode_GET_SIZE(self);
5597 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5598 int seplen = PyUnicode_GET_SIZE(sepobj);
5599 int i, j;
5600
5601 i = 0;
5602 if (striptype != RIGHTSTRIP) {
5603 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5604 i++;
5605 }
5606 }
5607
5608 j = len;
5609 if (striptype != LEFTSTRIP) {
5610 do {
5611 j--;
5612 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5613 j++;
5614 }
5615
5616 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5617 Py_INCREF(self);
5618 return (PyObject*)self;
5619 }
5620 else
5621 return PyUnicode_FromUnicode(s+i, j-i);
5622}
5623
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624
5625static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005626do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005628 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5629 int len = PyUnicode_GET_SIZE(self), i, j;
5630
5631 i = 0;
5632 if (striptype != RIGHTSTRIP) {
5633 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5634 i++;
5635 }
5636 }
5637
5638 j = len;
5639 if (striptype != LEFTSTRIP) {
5640 do {
5641 j--;
5642 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5643 j++;
5644 }
5645
5646 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5647 Py_INCREF(self);
5648 return (PyObject*)self;
5649 }
5650 else
5651 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652}
5653
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005654
5655static PyObject *
5656do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5657{
5658 PyObject *sep = NULL;
5659
5660 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5661 return NULL;
5662
5663 if (sep != NULL && sep != Py_None) {
5664 if (PyUnicode_Check(sep))
5665 return _PyUnicode_XStrip(self, striptype, sep);
5666 else if (PyString_Check(sep)) {
5667 PyObject *res;
5668 sep = PyUnicode_FromObject(sep);
5669 if (sep==NULL)
5670 return NULL;
5671 res = _PyUnicode_XStrip(self, striptype, sep);
5672 Py_DECREF(sep);
5673 return res;
5674 }
5675 else {
5676 PyErr_Format(PyExc_TypeError,
5677 "%s arg must be None, unicode or str",
5678 STRIPNAME(striptype));
5679 return NULL;
5680 }
5681 }
5682
5683 return do_strip(self, striptype);
5684}
5685
5686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005687PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005688"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005689\n\
5690Return a copy of the string S with leading and trailing\n\
5691whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005692If chars is given and not None, remove characters in chars instead.\n\
5693If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005694
5695static PyObject *
5696unicode_strip(PyUnicodeObject *self, PyObject *args)
5697{
5698 if (PyTuple_GET_SIZE(args) == 0)
5699 return do_strip(self, BOTHSTRIP); /* Common case */
5700 else
5701 return do_argstrip(self, BOTHSTRIP, args);
5702}
5703
5704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005705PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005706"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005707\n\
5708Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005709If chars is given and not None, remove characters in chars instead.\n\
5710If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005711
5712static PyObject *
5713unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5714{
5715 if (PyTuple_GET_SIZE(args) == 0)
5716 return do_strip(self, LEFTSTRIP); /* Common case */
5717 else
5718 return do_argstrip(self, LEFTSTRIP, args);
5719}
5720
5721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005722PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005723"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005724\n\
5725Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005726If chars is given and not None, remove characters in chars instead.\n\
5727If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005728
5729static PyObject *
5730unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5731{
5732 if (PyTuple_GET_SIZE(args) == 0)
5733 return do_strip(self, RIGHTSTRIP); /* Common case */
5734 else
5735 return do_argstrip(self, RIGHTSTRIP, args);
5736}
5737
5738
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739static PyObject*
5740unicode_repeat(PyUnicodeObject *str, int len)
5741{
5742 PyUnicodeObject *u;
5743 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005744 int nchars;
5745 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746
5747 if (len < 0)
5748 len = 0;
5749
Tim Peters7a29bd52001-09-12 03:03:31 +00005750 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 /* no repeat, return original string */
5752 Py_INCREF(str);
5753 return (PyObject*) str;
5754 }
Tim Peters8f422462000-09-09 06:13:41 +00005755
5756 /* ensure # of chars needed doesn't overflow int and # of bytes
5757 * needed doesn't overflow size_t
5758 */
5759 nchars = len * str->length;
5760 if (len && nchars / len != str->length) {
5761 PyErr_SetString(PyExc_OverflowError,
5762 "repeated string is too long");
5763 return NULL;
5764 }
5765 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5766 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5767 PyErr_SetString(PyExc_OverflowError,
5768 "repeated string is too long");
5769 return NULL;
5770 }
5771 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 if (!u)
5773 return NULL;
5774
5775 p = u->str;
5776
5777 while (len-- > 0) {
5778 Py_UNICODE_COPY(p, str->str, str->length);
5779 p += str->length;
5780 }
5781
5782 return (PyObject*) u;
5783}
5784
5785PyObject *PyUnicode_Replace(PyObject *obj,
5786 PyObject *subobj,
5787 PyObject *replobj,
5788 int maxcount)
5789{
5790 PyObject *self;
5791 PyObject *str1;
5792 PyObject *str2;
5793 PyObject *result;
5794
5795 self = PyUnicode_FromObject(obj);
5796 if (self == NULL)
5797 return NULL;
5798 str1 = PyUnicode_FromObject(subobj);
5799 if (str1 == NULL) {
5800 Py_DECREF(self);
5801 return NULL;
5802 }
5803 str2 = PyUnicode_FromObject(replobj);
5804 if (str2 == NULL) {
5805 Py_DECREF(self);
5806 Py_DECREF(str1);
5807 return NULL;
5808 }
Tim Petersced69f82003-09-16 20:30:58 +00005809 result = replace((PyUnicodeObject *)self,
5810 (PyUnicodeObject *)str1,
5811 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812 maxcount);
5813 Py_DECREF(self);
5814 Py_DECREF(str1);
5815 Py_DECREF(str2);
5816 return result;
5817}
5818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005819PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820"S.replace (old, new[, maxsplit]) -> unicode\n\
5821\n\
5822Return a copy of S with all occurrences of substring\n\
5823old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005824given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825
5826static PyObject*
5827unicode_replace(PyUnicodeObject *self, PyObject *args)
5828{
5829 PyUnicodeObject *str1;
5830 PyUnicodeObject *str2;
5831 int maxcount = -1;
5832 PyObject *result;
5833
5834 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5835 return NULL;
5836 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5837 if (str1 == NULL)
5838 return NULL;
5839 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005840 if (str2 == NULL) {
5841 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005843 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844
5845 result = replace(self, str1, str2, maxcount);
5846
5847 Py_DECREF(str1);
5848 Py_DECREF(str2);
5849 return result;
5850}
5851
5852static
5853PyObject *unicode_repr(PyObject *unicode)
5854{
5855 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5856 PyUnicode_GET_SIZE(unicode),
5857 1);
5858}
5859
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005860PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861"S.rfind(sub [,start [,end]]) -> int\n\
5862\n\
5863Return the highest index in S where substring sub is found,\n\
5864such that sub is contained within s[start,end]. Optional\n\
5865arguments start and end are interpreted as in slice notation.\n\
5866\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005867Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868
5869static PyObject *
5870unicode_rfind(PyUnicodeObject *self, PyObject *args)
5871{
5872 PyUnicodeObject *substring;
5873 int start = 0;
5874 int end = INT_MAX;
5875 PyObject *result;
5876
Guido van Rossumb8872e62000-05-09 14:14:27 +00005877 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5878 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 return NULL;
5880 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5881 (PyObject *)substring);
5882 if (substring == NULL)
5883 return NULL;
5884
5885 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5886
5887 Py_DECREF(substring);
5888 return result;
5889}
5890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005891PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892"S.rindex(sub [,start [,end]]) -> int\n\
5893\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005894Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895
5896static PyObject *
5897unicode_rindex(PyUnicodeObject *self, PyObject *args)
5898{
5899 int result;
5900 PyUnicodeObject *substring;
5901 int start = 0;
5902 int end = INT_MAX;
5903
Guido van Rossumb8872e62000-05-09 14:14:27 +00005904 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5905 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 return NULL;
5907 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5908 (PyObject *)substring);
5909 if (substring == NULL)
5910 return NULL;
5911
5912 result = findstring(self, substring, start, end, -1);
5913
5914 Py_DECREF(substring);
5915 if (result < 0) {
5916 PyErr_SetString(PyExc_ValueError, "substring not found");
5917 return NULL;
5918 }
5919 return PyInt_FromLong(result);
5920}
5921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005922PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005923"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924\n\
5925Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005926done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927
5928static PyObject *
5929unicode_rjust(PyUnicodeObject *self, PyObject *args)
5930{
5931 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005932 Py_UNICODE fillchar = ' ';
5933
5934 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 return NULL;
5936
Tim Peters7a29bd52001-09-12 03:03:31 +00005937 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 Py_INCREF(self);
5939 return (PyObject*) self;
5940 }
5941
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005942 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943}
5944
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945static PyObject*
5946unicode_slice(PyUnicodeObject *self, int start, int end)
5947{
5948 /* standard clamping */
5949 if (start < 0)
5950 start = 0;
5951 if (end < 0)
5952 end = 0;
5953 if (end > self->length)
5954 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005955 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 /* full slice, return original string */
5957 Py_INCREF(self);
5958 return (PyObject*) self;
5959 }
5960 if (start > end)
5961 start = end;
5962 /* copy slice */
5963 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5964 end - start);
5965}
5966
5967PyObject *PyUnicode_Split(PyObject *s,
5968 PyObject *sep,
5969 int maxsplit)
5970{
5971 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005972
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 s = PyUnicode_FromObject(s);
5974 if (s == NULL)
5975 return NULL;
5976 if (sep != NULL) {
5977 sep = PyUnicode_FromObject(sep);
5978 if (sep == NULL) {
5979 Py_DECREF(s);
5980 return NULL;
5981 }
5982 }
5983
5984 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5985
5986 Py_DECREF(s);
5987 Py_XDECREF(sep);
5988 return result;
5989}
5990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005991PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992"S.split([sep [,maxsplit]]) -> list of strings\n\
5993\n\
5994Return a list of the words in S, using sep as the\n\
5995delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00005996splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00005997any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998
5999static PyObject*
6000unicode_split(PyUnicodeObject *self, PyObject *args)
6001{
6002 PyObject *substring = Py_None;
6003 int maxcount = -1;
6004
6005 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
6006 return NULL;
6007
6008 if (substring == Py_None)
6009 return split(self, NULL, maxcount);
6010 else if (PyUnicode_Check(substring))
6011 return split(self, (PyUnicodeObject *)substring, maxcount);
6012 else
6013 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6014}
6015
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006016PyObject *PyUnicode_RSplit(PyObject *s,
6017 PyObject *sep,
6018 int maxsplit)
6019{
6020 PyObject *result;
6021
6022 s = PyUnicode_FromObject(s);
6023 if (s == NULL)
6024 return NULL;
6025 if (sep != NULL) {
6026 sep = PyUnicode_FromObject(sep);
6027 if (sep == NULL) {
6028 Py_DECREF(s);
6029 return NULL;
6030 }
6031 }
6032
6033 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6034
6035 Py_DECREF(s);
6036 Py_XDECREF(sep);
6037 return result;
6038}
6039
6040PyDoc_STRVAR(rsplit__doc__,
6041"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6042\n\
6043Return a list of the words in S, using sep as the\n\
6044delimiter string, starting at the end of the string and\n\
6045working to the front. If maxsplit is given, at most maxsplit\n\
6046splits are done. If sep is not specified, any whitespace string\n\
6047is a separator.");
6048
6049static PyObject*
6050unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6051{
6052 PyObject *substring = Py_None;
6053 int maxcount = -1;
6054
6055 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6056 return NULL;
6057
6058 if (substring == Py_None)
6059 return rsplit(self, NULL, maxcount);
6060 else if (PyUnicode_Check(substring))
6061 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6062 else
6063 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6064}
6065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006066PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006067"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068\n\
6069Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006070Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006071is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072
6073static PyObject*
6074unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6075{
Guido van Rossum86662912000-04-11 15:38:46 +00006076 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077
Guido van Rossum86662912000-04-11 15:38:46 +00006078 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 return NULL;
6080
Guido van Rossum86662912000-04-11 15:38:46 +00006081 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082}
6083
6084static
6085PyObject *unicode_str(PyUnicodeObject *self)
6086{
Fred Drakee4315f52000-05-09 19:53:39 +00006087 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088}
6089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006090PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091"S.swapcase() -> unicode\n\
6092\n\
6093Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006094and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095
6096static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006097unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 return fixup(self, fixswapcase);
6100}
6101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006102PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103"S.translate(table) -> unicode\n\
6104\n\
6105Return a copy of the string S, where all characters have been mapped\n\
6106through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006107Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6108Unmapped characters are left untouched. Characters mapped to None\n\
6109are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110
6111static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006112unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113{
Tim Petersced69f82003-09-16 20:30:58 +00006114 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006116 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 "ignore");
6118}
6119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006120PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121"S.upper() -> unicode\n\
6122\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006123Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124
6125static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006126unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 return fixup(self, fixupper);
6129}
6130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006131PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132"S.zfill(width) -> unicode\n\
6133\n\
6134Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006135of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136
6137static PyObject *
6138unicode_zfill(PyUnicodeObject *self, PyObject *args)
6139{
6140 int fill;
6141 PyUnicodeObject *u;
6142
6143 int width;
6144 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6145 return NULL;
6146
6147 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006148 if (PyUnicode_CheckExact(self)) {
6149 Py_INCREF(self);
6150 return (PyObject*) self;
6151 }
6152 else
6153 return PyUnicode_FromUnicode(
6154 PyUnicode_AS_UNICODE(self),
6155 PyUnicode_GET_SIZE(self)
6156 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 }
6158
6159 fill = width - self->length;
6160
6161 u = pad(self, fill, 0, '0');
6162
Walter Dörwald068325e2002-04-15 13:36:47 +00006163 if (u == NULL)
6164 return NULL;
6165
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 if (u->str[fill] == '+' || u->str[fill] == '-') {
6167 /* move sign to beginning of string */
6168 u->str[0] = u->str[fill];
6169 u->str[fill] = '0';
6170 }
6171
6172 return (PyObject*) u;
6173}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174
#if 0
/* Disabled debugging helper: report the value of unicode_freelist_size
   as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006183PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006184"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006186Return True if S starts with the specified prefix, False otherwise.\n\
6187With optional start, test S beginning at that position.\n\
6188With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189
6190static PyObject *
6191unicode_startswith(PyUnicodeObject *self,
6192 PyObject *args)
6193{
6194 PyUnicodeObject *substring;
6195 int start = 0;
6196 int end = INT_MAX;
6197 PyObject *result;
6198
Guido van Rossumb8872e62000-05-09 14:14:27 +00006199 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6200 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 return NULL;
6202 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6203 (PyObject *)substring);
6204 if (substring == NULL)
6205 return NULL;
6206
Guido van Rossum77f6a652002-04-03 22:41:51 +00006207 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208
6209 Py_DECREF(substring);
6210 return result;
6211}
6212
6213
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006214PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006215"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006217Return True if S ends with the specified suffix, False otherwise.\n\
6218With optional start, test S beginning at that position.\n\
6219With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220
6221static PyObject *
6222unicode_endswith(PyUnicodeObject *self,
6223 PyObject *args)
6224{
6225 PyUnicodeObject *substring;
6226 int start = 0;
6227 int end = INT_MAX;
6228 PyObject *result;
6229
Guido van Rossumb8872e62000-05-09 14:14:27 +00006230 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6231 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 return NULL;
6233 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6234 (PyObject *)substring);
6235 if (substring == NULL)
6236 return NULL;
6237
Guido van Rossum77f6a652002-04-03 22:41:51 +00006238 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239
6240 Py_DECREF(substring);
6241 return result;
6242}
6243
6244
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006245
6246static PyObject *
6247unicode_getnewargs(PyUnicodeObject *v)
6248{
6249 return Py_BuildValue("(u#)", v->str, v->length);
6250}
6251
6252
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253static PyMethodDef unicode_methods[] = {
6254
6255 /* Order is according to common usage: often used methods should
6256 appear first, since lookup is done sequentially. */
6257
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006258 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6259 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6260 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006261 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006262 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6263 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6264 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6265 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6266 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6267 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6268 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6269 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6270 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6271 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006272 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006273 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006274/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6275 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6276 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6277 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006278 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006279 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006280 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006281 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6282 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6283 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6284 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6285 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6286 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6287 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6288 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6289 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6290 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6291 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6292 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6293 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6294 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006295 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006296#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006297 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298#endif
6299
6300#if 0
6301 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006302 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303#endif
6304
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006305 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 {NULL, NULL}
6307};
6308
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006309static PyObject *
6310unicode_mod(PyObject *v, PyObject *w)
6311{
6312 if (!PyUnicode_Check(v)) {
6313 Py_INCREF(Py_NotImplemented);
6314 return Py_NotImplemented;
6315 }
6316 return PyUnicode_Format(v, w);
6317}
6318
6319static PyNumberMethods unicode_as_number = {
6320 0, /*nb_add*/
6321 0, /*nb_subtract*/
6322 0, /*nb_multiply*/
6323 0, /*nb_divide*/
6324 unicode_mod, /*nb_remainder*/
6325};
6326
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327static PySequenceMethods unicode_as_sequence = {
6328 (inquiry) unicode_length, /* sq_length */
6329 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6330 (intargfunc) unicode_repeat, /* sq_repeat */
6331 (intargfunc) unicode_getitem, /* sq_item */
6332 (intintargfunc) unicode_slice, /* sq_slice */
6333 0, /* sq_ass_item */
6334 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006335 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336};
6337
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006338static PyObject*
6339unicode_subscript(PyUnicodeObject* self, PyObject* item)
6340{
6341 if (PyInt_Check(item)) {
6342 long i = PyInt_AS_LONG(item);
6343 if (i < 0)
6344 i += PyString_GET_SIZE(self);
6345 return unicode_getitem(self, i);
6346 } else if (PyLong_Check(item)) {
6347 long i = PyLong_AsLong(item);
6348 if (i == -1 && PyErr_Occurred())
6349 return NULL;
6350 if (i < 0)
6351 i += PyString_GET_SIZE(self);
6352 return unicode_getitem(self, i);
6353 } else if (PySlice_Check(item)) {
6354 int start, stop, step, slicelength, cur, i;
6355 Py_UNICODE* source_buf;
6356 Py_UNICODE* result_buf;
6357 PyObject* result;
6358
6359 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6360 &start, &stop, &step, &slicelength) < 0) {
6361 return NULL;
6362 }
6363
6364 if (slicelength <= 0) {
6365 return PyUnicode_FromUnicode(NULL, 0);
6366 } else {
6367 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6368 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6369
6370 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6371 result_buf[i] = source_buf[cur];
6372 }
Tim Petersced69f82003-09-16 20:30:58 +00006373
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006374 result = PyUnicode_FromUnicode(result_buf, slicelength);
6375 PyMem_FREE(result_buf);
6376 return result;
6377 }
6378 } else {
6379 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6380 return NULL;
6381 }
6382}
6383
6384static PyMappingMethods unicode_as_mapping = {
6385 (inquiry)unicode_length, /* mp_length */
6386 (binaryfunc)unicode_subscript, /* mp_subscript */
6387 (objobjargproc)0, /* mp_ass_subscript */
6388};
6389
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390static int
6391unicode_buffer_getreadbuf(PyUnicodeObject *self,
6392 int index,
6393 const void **ptr)
6394{
6395 if (index != 0) {
6396 PyErr_SetString(PyExc_SystemError,
6397 "accessing non-existent unicode segment");
6398 return -1;
6399 }
6400 *ptr = (void *) self->str;
6401 return PyUnicode_GET_DATA_SIZE(self);
6402}
6403
6404static int
6405unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6406 const void **ptr)
6407{
6408 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006409 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 return -1;
6411}
6412
6413static int
6414unicode_buffer_getsegcount(PyUnicodeObject *self,
6415 int *lenp)
6416{
6417 if (lenp)
6418 *lenp = PyUnicode_GET_DATA_SIZE(self);
6419 return 1;
6420}
6421
6422static int
6423unicode_buffer_getcharbuf(PyUnicodeObject *self,
6424 int index,
6425 const void **ptr)
6426{
6427 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006428
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 if (index != 0) {
6430 PyErr_SetString(PyExc_SystemError,
6431 "accessing non-existent unicode segment");
6432 return -1;
6433 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006434 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 if (str == NULL)
6436 return -1;
6437 *ptr = (void *) PyString_AS_STRING(str);
6438 return PyString_GET_SIZE(str);
6439}
6440
6441/* Helpers for PyUnicode_Format() */
6442
6443static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006444getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445{
6446 int argidx = *p_argidx;
6447 if (argidx < arglen) {
6448 (*p_argidx)++;
6449 if (arglen < 0)
6450 return args;
6451 else
6452 return PyTuple_GetItem(args, argidx);
6453 }
6454 PyErr_SetString(PyExc_TypeError,
6455 "not enough arguments for format string");
6456 return NULL;
6457}
6458
/* Formatting flags, one bit each so they can be OR-ed together. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6464
6465static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467{
6468 register int i;
6469 int len;
6470 va_list va;
6471 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473
6474 /* First, format the string as char array, then expand to Py_UNICODE
6475 array. */
6476 charbuffer = (char *)buffer;
6477 len = vsprintf(charbuffer, format, va);
6478 for (i = len - 1; i >= 0; i--)
6479 buffer[i] = (Py_UNICODE) charbuffer[i];
6480
6481 va_end(va);
6482 return len;
6483}
6484
Guido van Rossum078151d2002-08-11 04:24:12 +00006485/* XXX To save some code duplication, formatfloat/long/int could have been
6486 shared with stringobject.c, converting from 8-bit to Unicode after the
6487 formatting is done. */
6488
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489static int
6490formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006491 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 int flags,
6493 int prec,
6494 int type,
6495 PyObject *v)
6496{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006497 /* fmt = '%#.' + `prec` + `type`
6498 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 char fmt[20];
6500 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006501
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 x = PyFloat_AsDouble(v);
6503 if (x == -1.0 && PyErr_Occurred())
6504 return -1;
6505 if (prec < 0)
6506 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6508 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006509 /* Worst case length calc to ensure no buffer overrun:
6510
6511 'g' formats:
6512 fmt = %#.<prec>g
6513 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6514 for any double rep.)
6515 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6516
6517 'f' formats:
6518 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6519 len = 1 + 50 + 1 + prec = 52 + prec
6520
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006521 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006522 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006523
6524 */
6525 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6526 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006527 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006528 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006529 return -1;
6530 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006531 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6532 (flags&F_ALT) ? "#" : "",
6533 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 return usprintf(buf, fmt, x);
6535}
6536
Tim Peters38fd5b62000-09-21 05:43:11 +00006537static PyObject*
6538formatlong(PyObject *val, int flags, int prec, int type)
6539{
6540 char *buf;
6541 int i, len;
6542 PyObject *str; /* temporary string object. */
6543 PyUnicodeObject *result;
6544
6545 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6546 if (!str)
6547 return NULL;
6548 result = _PyUnicode_New(len);
6549 for (i = 0; i < len; i++)
6550 result->str[i] = buf[i];
6551 result->str[len] = 0;
6552 Py_DECREF(str);
6553 return (PyObject*)result;
6554}
6555
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556static int
6557formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006558 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 int flags,
6560 int prec,
6561 int type,
6562 PyObject *v)
6563{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006564 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006565 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6566 * + 1 + 1
6567 * = 24
6568 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006569 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006570 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 long x;
6572
6573 x = PyInt_AsLong(v);
6574 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006575 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006576 if (x < 0 && type == 'u') {
6577 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006578 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006579 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6580 sign = "-";
6581 else
6582 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006584 prec = 1;
6585
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006586 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6587 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006588 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006589 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006590 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006591 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006592 return -1;
6593 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006594
6595 if ((flags & F_ALT) &&
6596 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006597 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006598 * of issues that cause pain:
6599 * - when 0 is being converted, the C standard leaves off
6600 * the '0x' or '0X', which is inconsistent with other
6601 * %#x/%#X conversions and inconsistent with Python's
6602 * hex() function
6603 * - there are platforms that violate the standard and
6604 * convert 0 with the '0x' or '0X'
6605 * (Metrowerks, Compaq Tru64)
6606 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006607 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006608 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006609 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006610 * We can achieve the desired consistency by inserting our
6611 * own '0x' or '0X' prefix, and substituting %x/%X in place
6612 * of %#x/%#X.
6613 *
6614 * Note that this is the same approach as used in
6615 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006616 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006617 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6618 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006619 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006620 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006621 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6622 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006623 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006624 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006625 if (sign[0])
6626 return usprintf(buf, fmt, -x);
6627 else
6628 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629}
6630
6631static int
6632formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006633 size_t buflen,
6634 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006636 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006637 if (PyUnicode_Check(v)) {
6638 if (PyUnicode_GET_SIZE(v) != 1)
6639 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006643 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006644 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006645 goto onError;
6646 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6647 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648
6649 else {
6650 /* Integer input truncated to a character */
6651 long x;
6652 x = PyInt_AsLong(v);
6653 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006654 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006655#ifdef Py_UNICODE_WIDE
6656 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006657 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006658 "%c arg not in range(0x110000) "
6659 "(wide Python build)");
6660 return -1;
6661 }
6662#else
6663 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006664 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006665 "%c arg not in range(0x10000) "
6666 "(narrow Python build)");
6667 return -1;
6668 }
6669#endif
6670 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 }
6672 buf[1] = '\0';
6673 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006674
6675 onError:
6676 PyErr_SetString(PyExc_TypeError,
6677 "%c requires int or char");
6678 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679}
6680
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006681/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6682
6683 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6684 chars are formatted. XXX This is a magic number. Each formatting
6685 routine does bounds checking to ensure no overflow, but a better
6686 solution may be to malloc a buffer of appropriate size for each
6687 format. For now, the current solution is sufficient.
6688*/
6689#define FORMATBUFLEN (size_t)120
6690
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691PyObject *PyUnicode_Format(PyObject *format,
6692 PyObject *args)
6693{
6694 Py_UNICODE *fmt, *res;
6695 int fmtcnt, rescnt, reslen, arglen, argidx;
6696 int args_owned = 0;
6697 PyUnicodeObject *result = NULL;
6698 PyObject *dict = NULL;
6699 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 if (format == NULL || args == NULL) {
6702 PyErr_BadInternalCall();
6703 return NULL;
6704 }
6705 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006706 if (uformat == NULL)
6707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 fmt = PyUnicode_AS_UNICODE(uformat);
6709 fmtcnt = PyUnicode_GET_SIZE(uformat);
6710
6711 reslen = rescnt = fmtcnt + 100;
6712 result = _PyUnicode_New(reslen);
6713 if (result == NULL)
6714 goto onError;
6715 res = PyUnicode_AS_UNICODE(result);
6716
6717 if (PyTuple_Check(args)) {
6718 arglen = PyTuple_Size(args);
6719 argidx = 0;
6720 }
6721 else {
6722 arglen = -1;
6723 argidx = -2;
6724 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006725 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6726 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727 dict = args;
6728
6729 while (--fmtcnt >= 0) {
6730 if (*fmt != '%') {
6731 if (--rescnt < 0) {
6732 rescnt = fmtcnt + 100;
6733 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006734 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 return NULL;
6736 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6737 --rescnt;
6738 }
6739 *res++ = *fmt++;
6740 }
6741 else {
6742 /* Got a format specifier */
6743 int flags = 0;
6744 int width = -1;
6745 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746 Py_UNICODE c = '\0';
6747 Py_UNICODE fill;
6748 PyObject *v = NULL;
6749 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006750 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 Py_UNICODE sign;
6752 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006753 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754
6755 fmt++;
6756 if (*fmt == '(') {
6757 Py_UNICODE *keystart;
6758 int keylen;
6759 PyObject *key;
6760 int pcount = 1;
6761
6762 if (dict == NULL) {
6763 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006764 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 goto onError;
6766 }
6767 ++fmt;
6768 --fmtcnt;
6769 keystart = fmt;
6770 /* Skip over balanced parentheses */
6771 while (pcount > 0 && --fmtcnt >= 0) {
6772 if (*fmt == ')')
6773 --pcount;
6774 else if (*fmt == '(')
6775 ++pcount;
6776 fmt++;
6777 }
6778 keylen = fmt - keystart - 1;
6779 if (fmtcnt < 0 || pcount > 0) {
6780 PyErr_SetString(PyExc_ValueError,
6781 "incomplete format key");
6782 goto onError;
6783 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006784#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006785 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 then looked up since Python uses strings to hold
6787 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006788 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 key = PyUnicode_EncodeUTF8(keystart,
6790 keylen,
6791 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006792#else
6793 key = PyUnicode_FromUnicode(keystart, keylen);
6794#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 if (key == NULL)
6796 goto onError;
6797 if (args_owned) {
6798 Py_DECREF(args);
6799 args_owned = 0;
6800 }
6801 args = PyObject_GetItem(dict, key);
6802 Py_DECREF(key);
6803 if (args == NULL) {
6804 goto onError;
6805 }
6806 args_owned = 1;
6807 arglen = -1;
6808 argidx = -2;
6809 }
6810 while (--fmtcnt >= 0) {
6811 switch (c = *fmt++) {
6812 case '-': flags |= F_LJUST; continue;
6813 case '+': flags |= F_SIGN; continue;
6814 case ' ': flags |= F_BLANK; continue;
6815 case '#': flags |= F_ALT; continue;
6816 case '0': flags |= F_ZERO; continue;
6817 }
6818 break;
6819 }
6820 if (c == '*') {
6821 v = getnextarg(args, arglen, &argidx);
6822 if (v == NULL)
6823 goto onError;
6824 if (!PyInt_Check(v)) {
6825 PyErr_SetString(PyExc_TypeError,
6826 "* wants int");
6827 goto onError;
6828 }
6829 width = PyInt_AsLong(v);
6830 if (width < 0) {
6831 flags |= F_LJUST;
6832 width = -width;
6833 }
6834 if (--fmtcnt >= 0)
6835 c = *fmt++;
6836 }
6837 else if (c >= '0' && c <= '9') {
6838 width = c - '0';
6839 while (--fmtcnt >= 0) {
6840 c = *fmt++;
6841 if (c < '0' || c > '9')
6842 break;
6843 if ((width*10) / 10 != width) {
6844 PyErr_SetString(PyExc_ValueError,
6845 "width too big");
6846 goto onError;
6847 }
6848 width = width*10 + (c - '0');
6849 }
6850 }
6851 if (c == '.') {
6852 prec = 0;
6853 if (--fmtcnt >= 0)
6854 c = *fmt++;
6855 if (c == '*') {
6856 v = getnextarg(args, arglen, &argidx);
6857 if (v == NULL)
6858 goto onError;
6859 if (!PyInt_Check(v)) {
6860 PyErr_SetString(PyExc_TypeError,
6861 "* wants int");
6862 goto onError;
6863 }
6864 prec = PyInt_AsLong(v);
6865 if (prec < 0)
6866 prec = 0;
6867 if (--fmtcnt >= 0)
6868 c = *fmt++;
6869 }
6870 else if (c >= '0' && c <= '9') {
6871 prec = c - '0';
6872 while (--fmtcnt >= 0) {
6873 c = Py_CHARMASK(*fmt++);
6874 if (c < '0' || c > '9')
6875 break;
6876 if ((prec*10) / 10 != prec) {
6877 PyErr_SetString(PyExc_ValueError,
6878 "prec too big");
6879 goto onError;
6880 }
6881 prec = prec*10 + (c - '0');
6882 }
6883 }
6884 } /* prec */
6885 if (fmtcnt >= 0) {
6886 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 if (--fmtcnt >= 0)
6888 c = *fmt++;
6889 }
6890 }
6891 if (fmtcnt < 0) {
6892 PyErr_SetString(PyExc_ValueError,
6893 "incomplete format");
6894 goto onError;
6895 }
6896 if (c != '%') {
6897 v = getnextarg(args, arglen, &argidx);
6898 if (v == NULL)
6899 goto onError;
6900 }
6901 sign = 0;
6902 fill = ' ';
6903 switch (c) {
6904
6905 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006906 pbuf = formatbuf;
6907 /* presume that buffer length is at least 1 */
6908 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 len = 1;
6910 break;
6911
6912 case 's':
6913 case 'r':
6914 if (PyUnicode_Check(v) && c == 's') {
6915 temp = v;
6916 Py_INCREF(temp);
6917 }
6918 else {
6919 PyObject *unicode;
6920 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006921 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 else
6923 temp = PyObject_Repr(v);
6924 if (temp == NULL)
6925 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006926 if (PyUnicode_Check(temp))
6927 /* nothing to do */;
6928 else if (PyString_Check(temp)) {
6929 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00006930 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006932 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 "strict");
6934 Py_DECREF(temp);
6935 temp = unicode;
6936 if (temp == NULL)
6937 goto onError;
6938 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006939 else {
6940 Py_DECREF(temp);
6941 PyErr_SetString(PyExc_TypeError,
6942 "%s argument has non-string str()");
6943 goto onError;
6944 }
6945 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006946 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 len = PyUnicode_GET_SIZE(temp);
6948 if (prec >= 0 && len > prec)
6949 len = prec;
6950 break;
6951
6952 case 'i':
6953 case 'd':
6954 case 'u':
6955 case 'o':
6956 case 'x':
6957 case 'X':
6958 if (c == 'i')
6959 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006960 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006961 temp = formatlong(v, flags, prec, c);
6962 if (!temp)
6963 goto onError;
6964 pbuf = PyUnicode_AS_UNICODE(temp);
6965 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00006966 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006968 else {
6969 pbuf = formatbuf;
6970 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6971 flags, prec, c, v);
6972 if (len < 0)
6973 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006974 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006975 }
6976 if (flags & F_ZERO)
6977 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 break;
6979
6980 case 'e':
6981 case 'E':
6982 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006983 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 case 'g':
6985 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006986 if (c == 'F')
6987 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006988 pbuf = formatbuf;
6989 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6990 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991 if (len < 0)
6992 goto onError;
6993 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006994 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 fill = '0';
6996 break;
6997
6998 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006999 pbuf = formatbuf;
7000 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 if (len < 0)
7002 goto onError;
7003 break;
7004
7005 default:
7006 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007007 "unsupported format character '%c' (0x%x) "
7008 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007009 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007010 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007011 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 goto onError;
7013 }
7014 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007015 if (*pbuf == '-' || *pbuf == '+') {
7016 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 len--;
7018 }
7019 else if (flags & F_SIGN)
7020 sign = '+';
7021 else if (flags & F_BLANK)
7022 sign = ' ';
7023 else
7024 sign = 0;
7025 }
7026 if (width < len)
7027 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007028 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 reslen -= rescnt;
7030 rescnt = width + fmtcnt + 100;
7031 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007032 if (reslen < 0) {
7033 Py_DECREF(result);
7034 return PyErr_NoMemory();
7035 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007036 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 return NULL;
7038 res = PyUnicode_AS_UNICODE(result)
7039 + reslen - rescnt;
7040 }
7041 if (sign) {
7042 if (fill != ' ')
7043 *res++ = sign;
7044 rescnt--;
7045 if (width > len)
7046 width--;
7047 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007048 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7049 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007050 assert(pbuf[1] == c);
7051 if (fill != ' ') {
7052 *res++ = *pbuf++;
7053 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007054 }
Tim Petersfff53252001-04-12 18:38:48 +00007055 rescnt -= 2;
7056 width -= 2;
7057 if (width < 0)
7058 width = 0;
7059 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007060 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 if (width > len && !(flags & F_LJUST)) {
7062 do {
7063 --rescnt;
7064 *res++ = fill;
7065 } while (--width > len);
7066 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007067 if (fill == ' ') {
7068 if (sign)
7069 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007070 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007071 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007072 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007073 *res++ = *pbuf++;
7074 *res++ = *pbuf++;
7075 }
7076 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007077 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 res += len;
7079 rescnt -= len;
7080 while (--width >= len) {
7081 --rescnt;
7082 *res++ = ' ';
7083 }
7084 if (dict && (argidx < arglen) && c != '%') {
7085 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007086 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 goto onError;
7088 }
7089 Py_XDECREF(temp);
7090 } /* '%' */
7091 } /* until end */
7092 if (argidx < arglen && !dict) {
7093 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007094 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 goto onError;
7096 }
7097
7098 if (args_owned) {
7099 Py_DECREF(args);
7100 }
7101 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007102 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007103 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 return (PyObject *)result;
7105
7106 onError:
7107 Py_XDECREF(result);
7108 Py_DECREF(uformat);
7109 if (args_owned) {
7110 Py_DECREF(args);
7111 }
7112 return NULL;
7113}
7114
7115static PyBufferProcs unicode_as_buffer = {
7116 (getreadbufferproc) unicode_buffer_getreadbuf,
7117 (getwritebufferproc) unicode_buffer_getwritebuf,
7118 (getsegcountproc) unicode_buffer_getsegcount,
7119 (getcharbufferproc) unicode_buffer_getcharbuf,
7120};
7121
Jeremy Hylton938ace62002-07-17 16:30:39 +00007122static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007123unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7124
Tim Peters6d6c1a32001-08-02 04:15:00 +00007125static PyObject *
7126unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7127{
7128 PyObject *x = NULL;
7129 static char *kwlist[] = {"string", "encoding", "errors", 0};
7130 char *encoding = NULL;
7131 char *errors = NULL;
7132
Guido van Rossume023fe02001-08-30 03:12:59 +00007133 if (type != &PyUnicode_Type)
7134 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007135 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7136 kwlist, &x, &encoding, &errors))
7137 return NULL;
7138 if (x == NULL)
7139 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007140 if (encoding == NULL && errors == NULL)
7141 return PyObject_Unicode(x);
7142 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007143 return PyUnicode_FromEncodedObject(x, encoding, errors);
7144}
7145
Guido van Rossume023fe02001-08-30 03:12:59 +00007146static PyObject *
7147unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7148{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007149 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007150 int n;
7151
7152 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7153 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7154 if (tmp == NULL)
7155 return NULL;
7156 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007157 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007158 if (pnew == NULL) {
7159 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007160 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007161 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007162 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7163 if (pnew->str == NULL) {
7164 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007165 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007166 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007167 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007168 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007169 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7170 pnew->length = n;
7171 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007172 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007173 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007174}
7175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007176PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007177"unicode(string [, encoding[, errors]]) -> object\n\
7178\n\
7179Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007180encoding defaults to the current default string encoding.\n\
7181errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007182
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183PyTypeObject PyUnicode_Type = {
7184 PyObject_HEAD_INIT(&PyType_Type)
7185 0, /* ob_size */
7186 "unicode", /* tp_name */
7187 sizeof(PyUnicodeObject), /* tp_size */
7188 0, /* tp_itemsize */
7189 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007190 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007192 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 0, /* tp_setattr */
7194 (cmpfunc) unicode_compare, /* tp_compare */
7195 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007196 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007198 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 (hashfunc) unicode_hash, /* tp_hash*/
7200 0, /* tp_call*/
7201 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007202 PyObject_GenericGetAttr, /* tp_getattro */
7203 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007205 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7206 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007207 unicode_doc, /* tp_doc */
7208 0, /* tp_traverse */
7209 0, /* tp_clear */
7210 0, /* tp_richcompare */
7211 0, /* tp_weaklistoffset */
7212 0, /* tp_iter */
7213 0, /* tp_iternext */
7214 unicode_methods, /* tp_methods */
7215 0, /* tp_members */
7216 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007217 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007218 0, /* tp_dict */
7219 0, /* tp_descr_get */
7220 0, /* tp_descr_set */
7221 0, /* tp_dictoffset */
7222 0, /* tp_init */
7223 0, /* tp_alloc */
7224 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007225 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226};
7227
7228/* Initialize the Unicode implementation */
7229
Thomas Wouters78890102000-07-22 19:25:51 +00007230void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007232 int i;
7233
Fred Drakee4315f52000-05-09 19:53:39 +00007234 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007235 unicode_freelist = NULL;
7236 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007238 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007239 for (i = 0; i < 256; i++)
7240 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007241 if (PyType_Ready(&PyUnicode_Type) < 0)
7242 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243}
7244
7245/* Finalize the Unicode implementation */
7246
7247void
Thomas Wouters78890102000-07-22 19:25:51 +00007248_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007250 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007251 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007253 Py_XDECREF(unicode_empty);
7254 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007256 for (i = 0; i < 256; i++) {
7257 if (unicode_latin1[i]) {
7258 Py_DECREF(unicode_latin1[i]);
7259 unicode_latin1[i] = NULL;
7260 }
7261 }
7262
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007263 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 PyUnicodeObject *v = u;
7265 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007266 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007267 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007268 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007269 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007271 unicode_freelist = NULL;
7272 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007274
7275/*
7276Local variables:
7277c-basic-offset: 4
7278indent-tabs-mode: nil
7279End:
7280*/